//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <numeric>
#include <optional>

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;

/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}
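
// An illustrative sketch (hypothetical types, not exercised by the pass
// itself) of how the breakdown behaves: splitting s96 into s32 pieces is
// exact, while splitting v7s8 into v2s8 pieces leaves a one-element
// remainder that is reported through the LeftoverTy out argument:
//
//   LLT Leftover;
//   getNarrowTypeBreakDown(LLT::scalar(96), LLT::scalar(32), Leftover);
//   // -> {3, 0}; Leftover stays invalid.
//
//   LLT VecLeftover;
//   getNarrowTypeBreakDown(LLT::fixed_vector(7, 8), LLT::fixed_vector(2, 8),
//                          VecLeftover);
//   // -> {3, 1}; VecLeftover == s8 (a single leftover element).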

static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}
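
// For example (sketch): getFloatTypeForLLT(Ctx, LLT::scalar(64)) returns
// Type::getDoubleTy(Ctx), while a vector type or an oddly sized scalar such
// as s20 yields nullptr, which callers below treat as "no libcall available".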

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {}

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B, GISelKnownBits *KB)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {}

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (isa<GIntrinsic>(MI))
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized
                                                     : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
                                   SmallVectorImpl<Register> &VRegs) {
  for (int i = 0; i < NumParts; ++i)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
                                   LLT MainTy, LLT &LeftoverTy,
                                   SmallVectorImpl<Register> &VRegs,
                                   SmallVectorImpl<Register> &LeftoverRegs) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned RegSize = RegTy.getSizeInBits();
  unsigned MainSize = MainTy.getSizeInBits();
  unsigned NumParts = RegSize / MainSize;
  unsigned LeftoverSize = RegSize - NumParts * MainSize;

  // Use an unmerge when possible.
  if (LeftoverSize == 0) {
    for (unsigned I = 0; I < NumParts; ++I)
      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
    MIRBuilder.buildUnmerge(VRegs, Reg);
    return true;
  }

  // Perform an irregular split. The leftover is the last element of RegPieces.
  if (MainTy.isVector()) {
    SmallVector<Register, 8> RegPieces;
    extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
    for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
      VRegs.push_back(RegPieces[i]);
    LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
    LeftoverTy = MRI.getType(LeftoverRegs[0]);
    return true;
  }

  LeftoverTy = LLT::scalar(LeftoverSize);
  // For irregular sizes, extract the individual parts.
  for (unsigned I = 0; I != NumParts; ++I) {
    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
    VRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
  }

  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
       Offset += LeftoverSize) {
    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
    LeftoverRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, Offset);
  }

  return true;
}

void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
                                         SmallVectorImpl<Register> &VRegs) {
  LLT RegTy = MRI.getType(Reg);
  assert(RegTy.isVector() && "Expected a vector type");

  LLT EltTy = RegTy.getElementType();
  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
  unsigned RegNumElts = RegTy.getNumElements();
  unsigned LeftoverNumElts = RegNumElts % NumElts;
  unsigned NumNarrowTyPieces = RegNumElts / NumElts;

  // Perfect split with no leftover.
  if (LeftoverNumElts == 0)
    return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);

  // Irregular split. Unmerge to individual elements so that the artifact
  // combiner has direct access to all of them, then build vectors with NumElts
  // elements each. The remaining element(s) form the leftover, either directly
  // or merged into a leftover vector.
  SmallVector<Register, 8> Elts;
  extractParts(Reg, EltTy, RegNumElts, Elts);

  unsigned Offset = 0;
  // Requested sub-vectors of NarrowTy.
  for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
    ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
    VRegs.push_back(MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
  }

  // Leftover element(s).
  if (LeftoverNumElts == 1) {
    VRegs.push_back(Elts[Offset]);
  } else {
    LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
    ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
    VRegs.push_back(
        MIRBuilder.buildMergeLikeInstr(LeftoverTy, Pieces).getReg(0));
  }
}
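
// A sketch of the irregular case (hypothetical registers): splitting
// %v:_(<9 x s8>) with NumElts == 4 first unmerges %v into nine s8 values
// %e0..%e8 and then produces
//
//   %p0:_(<4 x s8>) = G_BUILD_VECTOR %e0, %e1, %e2, %e3
//   %p1:_(<4 x s8>) = G_BUILD_VECTOR %e4, %e5, %e6, %e7
//
// with the single remaining element %e8:_(s8) appended as the leftover.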

void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different numbers of elements and insert them into
  // DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs;
    for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
      AllRegs.push_back(Reg);
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}

void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                       Register Reg) {
  LLT Ty = MRI.getType(Reg);
  SmallVector<Register, 8> RegElts;
  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
  Elts.append(RegElts);
}

/// Merge \p PartRegs with different types into \p DstReg.
void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
                                           ArrayRef<Register> PartRegs) {
  SmallVector<Register, 8> AllElts;
  for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
    appendVectorElts(AllElts, PartRegs[i]);

  Register Leftover = PartRegs[PartRegs.size() - 1];
  if (MRI.getType(Leftover).isScalar())
    AllElts.push_back(Leftover);
  else
    appendVectorElts(AllElts, Leftover);

  MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to
    // do anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}
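
// For instance (sketch): with SrcTy = s96, NarrowTy = s48 and DstTy = s32,
// the common type is GCD(96, 48, 32) = s16, so the source is unmerged into
// six s16 pieces that later merges can recombine to either width.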

LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
        MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can
  // reuse the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}
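
// A worked sketch (hypothetical types): for DstTy = s96 and NarrowTy = s64,
// LCMTy = s192 and NumParts = 3; with GCDTy = s32 each part is a merge of
// NumSubParts = 2 pieces. If VRegs holds only the three s32 pieces of the
// original s96 value, the last three slots are padding: under G_ANYEXT they
// are filled from a single G_IMPLICIT_DEF of GCDTy, and the all-padding
// third part reuses one NarrowTy-sized undef rather than merging two pieces.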

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}

static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}
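
// For example (sketch): getRTLibDesc(TargetOpcode::G_FSIN, 64) yields
// RTLIB::SIN_F64, which the target's getLibcallName() typically resolves to
// the C runtime symbol "sin"; an unhandled opcode reaches the unreachable.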

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be
    // the returned value.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC, LostDebugLocObserver &LocObserver,
                    MachineInstr *MI) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(),
                                *MIRBuilder.getMRI());

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }
  return LegalizerHelper::Legalized;
}
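
// A minimal usage sketch (hypothetical registers): emitting a call to fmodf
// with two float arguments and a float result would look like
//
//   createLibcall(MIRBuilder, "fmodf",
//                 {DstReg, Type::getFloatTy(Ctx), 0},
//                 {{Src0Reg, Type::getFloatTy(Ctx), 0},
//                  {Src1Reg, Type::getFloatTy(Ctx), 0}},
//                 CallingConv::C, LocObserver, &MI);
//
// Passing &MI allows the helper to fold the call into a tail call whenever
// isLibCallInTailPosition permits it.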

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    LostDebugLocObserver &LocObserver, MachineInstr *MI) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI);
}

// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType, LostDebugLocObserver &LocObserver) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
    Args.push_back({MO.getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args,
                       LocObserver, &MI);
}
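
// E.g. (sketch): a G_UREM on s64 maps through getRTLibDesc to
// RTLIB::UREM_I64, which on many 32-bit targets resolves to the
// compiler-rt/libgcc symbol "__umoddi3"; the destination and both source
// operands all share the same IR type.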

LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last, which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall =
      MI.getOperand(MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}
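
// A sketch of the net effect (hypothetical MIR): a non-tail
//
//   G_MEMCPY %dst(p0), %src(p0), %len(s64), 0
//
// becomes an ordinary call to the target's "memcpy" symbol with
// (%dst, %src, %len) as arguments; the trailing immediate only controls
// whether a tail call may be attempted.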

static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

#define LCALLS(A, B)                                                           \
  { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
#define LCALL5(A)                                                              \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_AND: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}
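
// For example (sketch): a 4-byte G_ATOMICRMW_ADD with acquire ordering
// selects RTLIB::OUTLINE_ATOMIC_LDADD4_ACQ from the table above via
// getOutlineAtomicHelper, while any vector memory type falls back to
// RTLIB::UNKNOWN_LIBCALL and therefore to the regular legalization paths.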

static LegalizerHelper::LegalizeResult
createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
               NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Success);
      RetTy = StructType::get(
          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
    }
    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val)
              .getReg(0);
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val)
              .getReg(0);
    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}
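
// For instance (sketch): a G_FPTRUNC from f64 to f32 maps through
// RTLIB::getFPROUND(MVT::f64, MVT::f32) to RTLIB::FPROUND_F64_F32, i.e. the
// "__truncdfsf2" symbol under the usual compiler-rt/libgcc naming.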

static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType, LostDebugLocObserver &LocObserver) {
  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(
      MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ToType, 0},
      {{MI.getOperand(1).getReg(), FromType, 0}}, LocObserver, &MI);
}

static RTLIB::Libcall
getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
  RTLIB::Libcall RTLibcall;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_GET_FPMODE:
    RTLibcall = RTLIB::FEGETMODE;
    break;
  case TargetOpcode::G_SET_FPMODE:
  case TargetOpcode::G_RESET_FPMODE:
    RTLibcall = RTLIB::FESETMODE;
    break;
  default:
    llvm_unreachable("Unexpected opcode");
  }
  return RTLibcall;
}

// Some library functions that read FP state (fegetmode, fegetenv) write the
// state into a region of memory. IR intrinsics that perform the same
// operations (get_fpmode, get_fpenv) return the state as an integer value. To
// implement these intrinsics via the library functions, we need a temporary
// variable, for example:
//
//     %0:_(s32) = G_GET_FPMODE
//
// is transformed to:
//
//     %1:_(p0) = G_FRAME_INDEX %stack.0
//     BL &fegetmode
//     %0:_(s32) = G_LOAD %1
//
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary, where the library function will put the read state.
  Register Dst = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Create a call to the library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  auto Res =
      createLibcall(MIRBuilder, RTLibcall,
                    CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                    CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                    LocObserver, nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
  MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);

  return LegalizerHelper::Legalized;
}

// Similar to `createGetStateLibcall`, this function calls a library function
// using transient space on the stack. In this case the library function reads
// the content of the memory region.
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary, where the library function will get the new state.
  Register Src = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Put the new state into the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
  MIRBuilder.buildStore(Src, Temp, *MMO);

  // Create a call to the library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                       LocObserver, nullptr);
}
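
// A sketch of the resulting MIR (hypothetical stack slot), mirroring the
// G_GET_FPMODE example above:
//
//     %1:_(p0) = G_FRAME_INDEX %stack.0
//     G_STORE %0(s32), %1(p0)
//     BL &fesetmode
//
// i.e. the new state is spilled to a stack temporary whose address is passed
// to the library function.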

// This function is used to legalize operations that set the default
// environment state. In the C library a call like `fesetmode(FE_DFL_MODE)` is
// used for that. On most targets supported by glibc, FE_DFL_MODE is defined as
// `((const femode_t *) -1)`, and that assumption is used here. If it does not
// hold for some target, the target must provide custom lowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                         MachineInstr &MI,
                                         LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();

  // Create an argument for the library function.
  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
  unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
  LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
  auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
  DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
  MIRBuilder.buildIntToPtr(Dest, DefValue);

  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
                       LocObserver, &MI);
}
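
// A sketch of the resulting MIR for G_RESET_FPMODE (assuming 64-bit
// pointers):
//
//     %c:_(s64) = G_CONSTANT i64 -1
//     %p:_(p0) = G_INTTOPTR %c(s64)
//     BL &fesetmode
//
// i.e. fesetmode is called with the FE_DFL_MODE sentinel as its argument.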

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLDEXP:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPOWI: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    std::initializer_list<CallLowering::ArgInfo> Args = {
        {MI.getOperand(1).getReg(), HLTy, 0},
        {MI.getOperand(2).getReg(), ITy, 1}};
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0},
                      Args, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto Status = createAtomicLibcall(MIRBuilder, MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  case TargetOpcode::G_GET_FPMODE: {
    LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_SET_FPMODE: {
    LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_RESET_FPMODE: {
    LegalizeResult Result =
        createResetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    //  a lot of regressions in the emitted code (superfluous COPYs, artifact
    //  combines not being hit). This seems to be a problem related to the
    //  artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;

    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
        LeftoverTy,
        Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
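  // E.g. (sketch): narrowing an s70 G_CONSTANT with NarrowTy = s32 emits two
  // s32 constants holding bits [0, 32) and [32, 64) plus an s6 leftover
  // constant for bits [64, 70), which insertParts then recombines into the
  // original destination.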
1336   case TargetOpcode::G_SEXT:
1337   case TargetOpcode::G_ZEXT:
1338   case TargetOpcode::G_ANYEXT:
1339     return narrowScalarExt(MI, TypeIdx, NarrowTy);
1340   case TargetOpcode::G_TRUNC: {
1341     if (TypeIdx != 1)
1342       return UnableToLegalize;
1343 
1344     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1345     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1346       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1347       return UnableToLegalize;
1348     }
1349 
1350     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
1351     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
1352     MI.eraseFromParent();
1353     return Legalized;
1354   }
1355 
1356   case TargetOpcode::G_FREEZE: {
1357     if (TypeIdx != 0)
1358       return UnableToLegalize;
1359 
1360     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1361     // Should widen scalar first
1362     if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1363       return UnableToLegalize;
1364 
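         // e.g. freezing an s64 via s32 pieces (a sketch):
         //   %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0:_(s64)
         //   %3:_(s32) = G_FREEZE %1
         //   %4:_(s32) = G_FREEZE %2
         //   %5:_(s64) = G_MERGE_VALUES %3:_(s32), %4:_(s32)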
1365     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
1366     SmallVector<Register, 8> Parts;
1367     for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1368       Parts.push_back(
1369           MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
1370     }
1371 
1372     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
1373     MI.eraseFromParent();
1374     return Legalized;
1375   }
1376   case TargetOpcode::G_ADD:
1377   case TargetOpcode::G_SUB:
1378   case TargetOpcode::G_SADDO:
1379   case TargetOpcode::G_SSUBO:
1380   case TargetOpcode::G_SADDE:
1381   case TargetOpcode::G_SSUBE:
1382   case TargetOpcode::G_UADDO:
1383   case TargetOpcode::G_USUBO:
1384   case TargetOpcode::G_UADDE:
1385   case TargetOpcode::G_USUBE:
1386     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1387   case TargetOpcode::G_MUL:
1388   case TargetOpcode::G_UMULH:
1389     return narrowScalarMul(MI, NarrowTy);
1390   case TargetOpcode::G_EXTRACT:
1391     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
1392   case TargetOpcode::G_INSERT:
1393     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
1394   case TargetOpcode::G_LOAD: {
1395     auto &LoadMI = cast<GLoad>(MI);
1396     Register DstReg = LoadMI.getDstReg();
1397     LLT DstTy = MRI.getType(DstReg);
1398     if (DstTy.isVector())
1399       return UnableToLegalize;
1400 
1401     if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
1402       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1403       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
1404       MIRBuilder.buildAnyExt(DstReg, TmpReg);
1405       LoadMI.eraseFromParent();
1406       return Legalized;
1407     }
1408 
1409     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
1410   }
1411   case TargetOpcode::G_ZEXTLOAD:
1412   case TargetOpcode::G_SEXTLOAD: {
1413     auto &LoadMI = cast<GExtLoad>(MI);
1414     Register DstReg = LoadMI.getDstReg();
1415     Register PtrReg = LoadMI.getPointerReg();
1416 
1417     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1418     auto &MMO = LoadMI.getMMO();
1419     unsigned MemSize = MMO.getSizeInBits();
1420 
1421     if (MemSize == NarrowSize) {
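         // Sketch of the three cases for NarrowTy = s32:
         //   mem size == 32: emit a plain G_LOAD of s32, then zext/sext it.
         //   mem size <  32: keep the extending load, but only to s32, e.g.
         //     %tmp:_(s32) = G_ZEXTLOAD %ptr :: (load (s8))
         //     %dst:_(s64) = G_ZEXT %tmp
         //   mem size >  32: the load itself must be split; not implemented.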
1422       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
1423     } else if (MemSize < NarrowSize) {
1424       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
1425     } else if (MemSize > NarrowSize) {
1426       // FIXME: Need to split the load.
1427       return UnableToLegalize;
1428     }
1429 
1430     if (isa<GZExtLoad>(LoadMI))
1431       MIRBuilder.buildZExt(DstReg, TmpReg);
1432     else
1433       MIRBuilder.buildSExt(DstReg, TmpReg);
1434 
1435     LoadMI.eraseFromParent();
1436     return Legalized;
1437   }
1438   case TargetOpcode::G_STORE: {
1439     auto &StoreMI = cast<GStore>(MI);
1440 
1441     Register SrcReg = StoreMI.getValueReg();
1442     LLT SrcTy = MRI.getType(SrcReg);
1443     if (SrcTy.isVector())
1444       return UnableToLegalize;
1445 
1446     int NumParts = SizeOp0 / NarrowSize;
1447     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1448     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1449     if (SrcTy.isVector() && LeftoverBits != 0)
1450       return UnableToLegalize;
1451 
1452     if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
1453       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1454       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1455       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1456       StoreMI.eraseFromParent();
1457       return Legalized;
1458     }
1459 
1460     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1461   }
1462   case TargetOpcode::G_SELECT:
1463     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1464   case TargetOpcode::G_AND:
1465   case TargetOpcode::G_OR:
1466   case TargetOpcode::G_XOR: {
1467     // Legalize bitwise operation:
1468     // A = BinOp<Ty> B, C
1469     // into:
1470     // B1, ..., BN = G_UNMERGE_VALUES B
1471     // C1, ..., CN = G_UNMERGE_VALUES C
1472     // A1 = BinOp<Ty/N> B1, C1
1473     // ...
1474     // AN = BinOp<Ty/N> BN, CN
1475     // A = G_MERGE_VALUES A1, ..., AN
1476     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1477   }
1478   case TargetOpcode::G_SHL:
1479   case TargetOpcode::G_LSHR:
1480   case TargetOpcode::G_ASHR:
1481     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1482   case TargetOpcode::G_CTLZ:
1483   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1484   case TargetOpcode::G_CTTZ:
1485   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1486   case TargetOpcode::G_CTPOP:
1487     if (TypeIdx == 1)
1488       switch (MI.getOpcode()) {
1489       case TargetOpcode::G_CTLZ:
1490       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1491         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1492       case TargetOpcode::G_CTTZ:
1493       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1494         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1495       case TargetOpcode::G_CTPOP:
1496         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1497       default:
1498         return UnableToLegalize;
1499       }
1500 
1501     Observer.changingInstr(MI);
1502     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1503     Observer.changedInstr(MI);
1504     return Legalized;
1505   case TargetOpcode::G_INTTOPTR:
1506     if (TypeIdx != 1)
1507       return UnableToLegalize;
1508 
1509     Observer.changingInstr(MI);
1510     narrowScalarSrc(MI, NarrowTy, 1);
1511     Observer.changedInstr(MI);
1512     return Legalized;
1513   case TargetOpcode::G_PTRTOINT:
1514     if (TypeIdx != 0)
1515       return UnableToLegalize;
1516 
1517     Observer.changingInstr(MI);
1518     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1519     Observer.changedInstr(MI);
1520     return Legalized;
1521   case TargetOpcode::G_PHI: {
1522     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1523     // NarrowSize.
1524     if (SizeOp0 % NarrowSize != 0)
1525       return UnableToLegalize;
1526 
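         // Unmerge each incoming value in its predecessor block, emit one
         // narrow G_PHI per part, and remerge after the PHIs, e.g. an s64
         // PHI at s32 (a sketch):
         //   %p0:_(s32) = G_PHI %a0(s32), %bb.1, %b0(s32), %bb.2
         //   %p1:_(s32) = G_PHI %a1(s32), %bb.1, %b1(s32), %bb.2
         //   %dst:_(s64) = G_MERGE_VALUES %p0:_(s32), %p1:_(s32)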
1527     unsigned NumParts = SizeOp0 / NarrowSize;
1528     SmallVector<Register, 2> DstRegs(NumParts);
1529     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1530     Observer.changingInstr(MI);
1531     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1532       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1533       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
1534       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1535                    SrcRegs[i / 2]);
1536     }
1537     MachineBasicBlock &MBB = *MI.getParent();
1538     MIRBuilder.setInsertPt(MBB, MI);
1539     for (unsigned i = 0; i < NumParts; ++i) {
1540       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1541       MachineInstrBuilder MIB =
1542           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1543       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1544         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1545     }
1546     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1547     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1548     Observer.changedInstr(MI);
1549     MI.eraseFromParent();
1550     return Legalized;
1551   }
1552   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1553   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1554     if (TypeIdx != 2)
1555       return UnableToLegalize;
1556 
1557     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1558     Observer.changingInstr(MI);
1559     narrowScalarSrc(MI, NarrowTy, OpIdx);
1560     Observer.changedInstr(MI);
1561     return Legalized;
1562   }
1563   case TargetOpcode::G_ICMP: {
1564     Register LHS = MI.getOperand(2).getReg();
1565     LLT SrcTy = MRI.getType(LHS);
1566     uint64_t SrcSize = SrcTy.getSizeInBits();
1567     CmpInst::Predicate Pred =
1568         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1569 
1570     // TODO: Handle the non-equality case for weird sizes.
1571     if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
1572       return UnableToLegalize;
1573 
1574     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1575     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1576     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1577                       LHSLeftoverRegs))
1578       return UnableToLegalize;
1579 
1580     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1581     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1582     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1583                       RHSPartRegs, RHSLeftoverRegs))
1584       return UnableToLegalize;
1585 
1586     // We now have the LHS and RHS of the compare split into narrow-type
1587     // registers, plus potentially some leftover type.
1588     Register Dst = MI.getOperand(0).getReg();
1589     LLT ResTy = MRI.getType(Dst);
1590     if (ICmpInst::isEquality(Pred)) {
1591       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1592       // them together. For each equal part, the result should be all 0s. For
1593       // each non-equal part, we'll get at least one 1.
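           // e.g. an s64 equality compare split at s32 (a sketch):
           //   %x0:_(s32) = G_XOR %lhs0, %rhs0
           //   %x1:_(s32) = G_XOR %lhs1, %rhs1
           //   %or:_(s32) = G_OR %x0, %x1
           //   %dst:_(s1) = G_ICMP intpred(eq), %or, %zero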
1594       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1595       SmallVector<Register, 4> Xors;
1596       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1597         auto LHS = std::get<0>(LHSAndRHS);
1598         auto RHS = std::get<1>(LHSAndRHS);
1599         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1600         Xors.push_back(Xor);
1601       }
1602 
1603       // Build a G_XOR for each leftover register. Each G_XOR must be widened
1604       // to the desired narrow type so that we can OR them together later.
1605       SmallVector<Register, 4> WidenedXors;
1606       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1607         auto LHS = std::get<0>(LHSAndRHS);
1608         auto RHS = std::get<1>(LHSAndRHS);
1609         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1610         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1611         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1612                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1613         Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1614       }
1615 
1616       // Now, for each part we broke up, we know if they are equal/not equal
1617       // based off the G_XOR. We can OR these all together and compare against
1618       // 0 to get the result.
1619       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1620       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1621       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1622         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1623       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1624     } else {
1625       // TODO: Handle non-power-of-two types.
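           // Compare the high parts with the original predicate, the low
           // parts with its unsigned form, and pick the low-part result when
           // the high parts are equal. Sketch for a signed less-than:
           //   %cmpH:_(s1) = G_ICMP intpred(slt), %lhsH, %rhsH
           //   %eqH:_(s1)  = G_ICMP intpred(eq),  %lhsH, %rhsH
           //   %cmpL:_(s1) = G_ICMP intpred(ult), %lhsL, %rhsL
           //   %dst:_(s1)  = G_SELECT %eqH, %cmpL, %cmpH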
1626       assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1627       assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1628       Register LHSL = LHSPartRegs[0];
1629       Register LHSH = LHSPartRegs[1];
1630       Register RHSL = RHSPartRegs[0];
1631       Register RHSH = RHSPartRegs[1];
1632       MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
1633       MachineInstrBuilder CmpHEQ =
1634           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
1635       MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1636           ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
1637       MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
1638     }
1639     MI.eraseFromParent();
1640     return Legalized;
1641   }
1642   case TargetOpcode::G_SEXT_INREG: {
1643     if (TypeIdx != 0)
1644       return UnableToLegalize;
1645 
1646     int64_t SizeInBits = MI.getOperand(2).getImm();
1647 
1648     // So long as the new type has more bits than the bits we're extending,
1649     // we don't need to break it apart.
1650     if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1651       Observer.changingInstr(MI);
1652       // We don't lose any non-extension bits by truncating the src and
1653       // sign-extending the dst.
1654       MachineOperand &MO1 = MI.getOperand(1);
1655       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1656       MO1.setReg(TruncMIB.getReg(0));
1657 
1658       MachineOperand &MO2 = MI.getOperand(0);
1659       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1660       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1661       MIRBuilder.buildSExt(MO2, DstExt);
1662       MO2.setReg(DstExt);
1663       Observer.changedInstr(MI);
1664       return Legalized;
1665     }
1666 
1667     // Break it apart. Components below the extension point are unmodified. The
1668     // component containing the extension point becomes a narrower SEXT_INREG.
1669     // Components above it are ashr'd from the component containing the
1670     // extension point.
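         // e.g. narrowing %0:_(s64) = G_SEXT_INREG %1:_(s64), 8 at s32
         // (a sketch):
         //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %1:_(s64)
         //   %slo:_(s32) = G_SEXT_INREG %lo, 8
         //   %shi:_(s32) = G_ASHR %slo, 31
         //   %0:_(s64) = G_MERGE_VALUES %slo:_(s32), %shi:_(s32)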
1671     if (SizeOp0 % NarrowSize != 0)
1672       return UnableToLegalize;
1673     int NumParts = SizeOp0 / NarrowSize;
1674 
1675     // List the registers where the destination will be scattered.
1676     SmallVector<Register, 2> DstRegs;
1677     // List the registers where the source will be split.
1678     SmallVector<Register, 2> SrcRegs;
1679 
1680     // Create all the temporary registers.
1681     for (int i = 0; i < NumParts; ++i) {
1682       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1683 
1684       SrcRegs.push_back(SrcReg);
1685     }
1686 
1687     // Explode the big arguments into smaller chunks.
1688     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1689 
1690     Register AshrCstReg =
1691         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1692             .getReg(0);
1693     Register FullExtensionReg;
1694     Register PartialExtensionReg;
1695 
1696     // Do the operation on each small part.
1697     for (int i = 0; i < NumParts; ++i) {
1698       if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
1699         DstRegs.push_back(SrcRegs[i]);
1700         PartialExtensionReg = DstRegs.back();
1701       } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1702         assert(PartialExtensionReg &&
1703                "Expected to visit partial extension before full");
1704         if (FullExtensionReg) {
1705           DstRegs.push_back(FullExtensionReg);
1706           continue;
1707         }
1708         DstRegs.push_back(
1709             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1710                 .getReg(0));
1711         FullExtensionReg = DstRegs.back();
1712       } else {
1713         DstRegs.push_back(
1714             MIRBuilder
1715                 .buildInstr(
1716                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1717                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1718                 .getReg(0));
1719         PartialExtensionReg = DstRegs.back();
1720       }
1721     }
1722 
1723     // Gather the destination registers into the final destination.
1724     Register DstReg = MI.getOperand(0).getReg();
1725     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1726     MI.eraseFromParent();
1727     return Legalized;
1728   }
1729   case TargetOpcode::G_BSWAP:
1730   case TargetOpcode::G_BITREVERSE: {
1731     if (SizeOp0 % NarrowSize != 0)
1732       return UnableToLegalize;
1733 
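         // Operate on each part, then remerge the parts in reverse order,
         // e.g. an s64 G_BSWAP at s32 (a sketch):
         //   %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0:_(s64)
         //   %3:_(s32) = G_BSWAP %2
         //   %4:_(s32) = G_BSWAP %1
         //   %5:_(s64) = G_MERGE_VALUES %3:_(s32), %4:_(s32)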
1734     Observer.changingInstr(MI);
1735     SmallVector<Register, 2> SrcRegs, DstRegs;
1736     unsigned NumParts = SizeOp0 / NarrowSize;
1737     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
1738 
1739     for (unsigned i = 0; i < NumParts; ++i) {
1740       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1741                                            {SrcRegs[NumParts - 1 - i]});
1742       DstRegs.push_back(DstPart.getReg(0));
1743     }
1744 
1745     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1746 
1747     Observer.changedInstr(MI);
1748     MI.eraseFromParent();
1749     return Legalized;
1750   }
1751   case TargetOpcode::G_PTR_ADD:
1752   case TargetOpcode::G_PTRMASK: {
1753     if (TypeIdx != 1)
1754       return UnableToLegalize;
1755     Observer.changingInstr(MI);
1756     narrowScalarSrc(MI, NarrowTy, 2);
1757     Observer.changedInstr(MI);
1758     return Legalized;
1759   }
1760   case TargetOpcode::G_FPTOUI:
1761   case TargetOpcode::G_FPTOSI:
1762     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1763   case TargetOpcode::G_FPEXT:
1764     if (TypeIdx != 0)
1765       return UnableToLegalize;
1766     Observer.changingInstr(MI);
1767     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1768     Observer.changedInstr(MI);
1769     return Legalized;
1770   case TargetOpcode::G_FLDEXP:
1771   case TargetOpcode::G_STRICT_FLDEXP:
1772     return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy);
1773   }
1774 }
1775 
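     /// Return a register holding the bits of \p Val as a single scalar:
     /// pointers go through G_PTRTOINT (failing with an invalid register for
     /// non-integral address spaces) and vectors are bitcast to a scalar,
     /// with vectors of pointers first converted via G_PTRTOINT.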
1776 Register LegalizerHelper::coerceToScalar(Register Val) {
1777   LLT Ty = MRI.getType(Val);
1778   if (Ty.isScalar())
1779     return Val;
1780 
1781   const DataLayout &DL = MIRBuilder.getDataLayout();
1782   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1783   if (Ty.isPointer()) {
1784     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1785       return Register();
1786     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1787   }
1788 
1789   Register NewVal = Val;
1790 
1791   assert(Ty.isVector());
1792   LLT EltTy = Ty.getElementType();
1793   if (EltTy.isPointer())
1794     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1795   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1796 }
1797 
1798 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1799                                      unsigned OpIdx, unsigned ExtOpcode) {
1800   MachineOperand &MO = MI.getOperand(OpIdx);
1801   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1802   MO.setReg(ExtB.getReg(0));
1803 }
1804 
1805 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1806                                       unsigned OpIdx) {
1807   MachineOperand &MO = MI.getOperand(OpIdx);
1808   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1809   MO.setReg(ExtB.getReg(0));
1810 }
1811 
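     // Re-define the destination at \p OpIdx as a fresh WideTy register and
     // emit the given truncating opcode just after MI to narrow the wide
     // value back into the original destination register.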
1812 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1813                                      unsigned OpIdx, unsigned TruncOpcode) {
1814   MachineOperand &MO = MI.getOperand(OpIdx);
1815   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1816   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1817   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1818   MO.setReg(DstExt);
1819 }
1820 
1821 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1822                                       unsigned OpIdx, unsigned ExtOpcode) {
1823   MachineOperand &MO = MI.getOperand(OpIdx);
1824   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1825   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1826   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1827   MO.setReg(DstTrunc);
1828 }
1829 
1830 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1831                                             unsigned OpIdx) {
1832   MachineOperand &MO = MI.getOperand(OpIdx);
1833   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1834   Register Dst = MO.getReg();
1835   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1836   MO.setReg(DstExt);
1837   MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
1838 }
1839 
1840 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1841                                             unsigned OpIdx) {
1842   MachineOperand &MO = MI.getOperand(OpIdx);
1843 
1844   MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
1845 }
1846 
1847 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1848   MachineOperand &Op = MI.getOperand(OpIdx);
1849   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1850 }
1851 
1852 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1853   MachineOperand &MO = MI.getOperand(OpIdx);
1854   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1855   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1856   MIRBuilder.buildBitcast(MO, CastDst);
1857   MO.setReg(CastDst);
1858 }
1859 
1860 LegalizerHelper::LegalizeResult
1861 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1862                                         LLT WideTy) {
1863   if (TypeIdx != 1)
1864     return UnableToLegalize;
1865 
1866   auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
1867   if (DstTy.isVector())
1868     return UnableToLegalize;
1869 
1870   LLT SrcTy = MRI.getType(Src1Reg);
1871   const int DstSize = DstTy.getSizeInBits();
1872   const int SrcSize = SrcTy.getSizeInBits();
1873   const int WideSize = WideTy.getSizeInBits();
1874   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1875 
1876   unsigned NumOps = MI.getNumOperands();
1877   unsigned NumSrc = MI.getNumOperands() - 1;
1878   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1879 
1880   if (WideSize >= DstSize) {
1881     // Directly pack the bits in the target type.
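         // e.g. widening %0:_(s16) = G_MERGE_VALUES %a:_(s8), %b:_(s8) to s32
         // (a sketch; the trailing G_TRUNC appears because WideTy > DstTy):
         //   %za:_(s32) = G_ZEXT %a
         //   %zb:_(s32) = G_ZEXT %b
         //   %shl:_(s32) = G_SHL %zb, 8
         //   %or:_(s32) = G_OR %za, %shl
         //   %0:_(s16) = G_TRUNC %or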
1882     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0);
1883 
1884     for (unsigned I = 2; I != NumOps; ++I) {
1885       const unsigned Offset = (I - 1) * PartSize;
1886 
1887       Register SrcReg = MI.getOperand(I).getReg();
1888       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1889 
1890       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1891 
1892       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1893         MRI.createGenericVirtualRegister(WideTy);
1894 
1895       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1896       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1897       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1898       ResultReg = NextResult;
1899     }
1900 
1901     if (WideSize > DstSize)
1902       MIRBuilder.buildTrunc(DstReg, ResultReg);
1903     else if (DstTy.isPointer())
1904       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1905 
1906     MI.eraseFromParent();
1907     return Legalized;
1908   }
1909 
1910   // Unmerge the original values to the GCD type, and recombine to the next
1911   // multiple greater than the original type.
1912   //
1913   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1914   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1915   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1916   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1917   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1918   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1919   // %12:_(s12) = G_MERGE_VALUES %10, %11
1920   //
1921   // Padding with undef if necessary:
1922   //
1923   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1924   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1925   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1926   // %7:_(s2) = G_IMPLICIT_DEF
1927   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1928   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1929   // %10:_(s12) = G_MERGE_VALUES %8, %9
1930 
1931   const int GCD = std::gcd(SrcSize, WideSize);
1932   LLT GCDTy = LLT::scalar(GCD);
1933 
1934   SmallVector<Register, 8> Parts;
1935   SmallVector<Register, 8> NewMergeRegs;
1936   SmallVector<Register, 8> Unmerges;
1937   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1938 
1939   // Decompose the original operands if they don't evenly divide.
1940   for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
1941     Register SrcReg = MO.getReg();
1942     if (GCD == SrcSize) {
1943       Unmerges.push_back(SrcReg);
1944     } else {
1945       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1946       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1947         Unmerges.push_back(Unmerge.getReg(J));
1948     }
1949   }
1950 
1951   // Pad with undef so the piece count covers all NumMerge merges, each of
1952   // which consumes PartsPerGCD pieces.
1953   const int PartsPerGCD = WideSize / GCD;
1954   if (static_cast<int>(Unmerges.size()) != NumMerge * PartsPerGCD) {
1955     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1956     for (int I = Unmerges.size(); I != NumMerge * PartsPerGCD; ++I)
1957       Unmerges.push_back(UndefReg);
1958   }
1959 
1960   // Build merges of each piece.
1961   ArrayRef<Register> Slicer(Unmerges);
1962   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1963     auto Merge =
1964         MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD));
1965     NewMergeRegs.push_back(Merge.getReg(0));
1966   }
1967 
1968   // A truncate may be necessary if the requested type doesn't evenly divide the
1969   // original result type.
1970   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1971     MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
1972   } else {
1973     auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
1974     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1975   }
1976 
1977   MI.eraseFromParent();
1978   return Legalized;
1979 }
1980 
1981 LegalizerHelper::LegalizeResult
1982 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1983                                           LLT WideTy) {
1984   if (TypeIdx != 0)
1985     return UnableToLegalize;
1986 
1987   int NumDst = MI.getNumOperands() - 1;
1988   Register SrcReg = MI.getOperand(NumDst).getReg();
1989   LLT SrcTy = MRI.getType(SrcReg);
1990   if (SrcTy.isVector())
1991     return UnableToLegalize;
1992 
1993   Register Dst0Reg = MI.getOperand(0).getReg();
1994   LLT DstTy = MRI.getType(Dst0Reg);
1995   if (!DstTy.isScalar())
1996     return UnableToLegalize;
1997 
1998   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1999     if (SrcTy.isPointer()) {
2000       const DataLayout &DL = MIRBuilder.getDataLayout();
2001       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
2002         LLVM_DEBUG(
2003             dbgs() << "Not casting non-integral address space integer\n");
2004         return UnableToLegalize;
2005       }
2006 
2007       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
2008       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
2009     }
2010 
2011     // Widen SrcTy to WideTy. This does not affect the result, but since the
2012     // user requested this size, it is probably better handled than SrcTy and
2013     // should reduce the total number of legalization artifacts.
2014     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2015       SrcTy = WideTy;
2016       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
2017     }
2018 
2019     // There's no unmerge type to target. Directly extract the bits from the
2020     // source type.
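         // e.g. %1:_(s8), %2:_(s8) = G_UNMERGE_VALUES %0:_(s16), widened to
         // s32 (a sketch):
         //   %3:_(s32) = G_ANYEXT %0:_(s16)
         //   %1:_(s8) = G_TRUNC %3
         //   %4:_(s32) = G_LSHR %3, 8
         //   %2:_(s8) = G_TRUNC %4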
2021     unsigned DstSize = DstTy.getSizeInBits();
2022 
2023     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
2024     for (int I = 1; I != NumDst; ++I) {
2025       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
2026       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
2027       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
2028     }
2029 
2030     MI.eraseFromParent();
2031     return Legalized;
2032   }
2033 
2034   // Extend the source to a wider type.
2035   LLT LCMTy = getLCMType(SrcTy, WideTy);
2036 
2037   Register WideSrc = SrcReg;
2038   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
2039     // TODO: If this is an integral address space, cast to integer and anyext.
2040     if (SrcTy.isPointer()) {
2041       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
2042       return UnableToLegalize;
2043     }
2044 
2045     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
2046   }
2047 
2048   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
2049 
2050   // Create a sequence of unmerges and merges to the original results. Since we
2051   // may have widened the source, we will need to pad the results with dead defs
2052   // to cover the source register.
2053   // e.g. widen s48 to s64:
2054   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
2055   //
2056   // =>
2057   //  %4:_(s192) = G_ANYEXT %0:_(s96)
2058   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
2059   //  ; unpack to GCD type, with extra dead defs
2060   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
2061   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
2062   //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
2063   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
2064   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
2065   const LLT GCDTy = getGCDType(WideTy, DstTy);
2066   const int NumUnmerge = Unmerge->getNumOperands() - 1;
2067   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
2068 
2069   // Directly unmerge to the destination without going through a GCD type
2070   // if possible
2071   if (PartsPerRemerge == 1) {
2072     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
2073 
2074     for (int I = 0; I != NumUnmerge; ++I) {
2075       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
2076 
2077       for (int J = 0; J != PartsPerUnmerge; ++J) {
2078         int Idx = I * PartsPerUnmerge + J;
2079         if (Idx < NumDst)
2080           MIB.addDef(MI.getOperand(Idx).getReg());
2081         else {
2082           // Create dead def for excess components.
2083           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
2084         }
2085       }
2086 
2087       MIB.addUse(Unmerge.getReg(I));
2088     }
2089   } else {
2090     SmallVector<Register, 16> Parts;
2091     for (int J = 0; J != NumUnmerge; ++J)
2092       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
2093 
2094     SmallVector<Register, 8> RemergeParts;
2095     for (int I = 0; I != NumDst; ++I) {
2096       for (int J = 0; J < PartsPerRemerge; ++J) {
2097         const int Idx = I * PartsPerRemerge + J;
2098         RemergeParts.emplace_back(Parts[Idx]);
2099       }
2100 
2101       MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts);
2102       RemergeParts.clear();
2103     }
2104   }
2105 
2106   MI.eraseFromParent();
2107   return Legalized;
2108 }
2109 
2110 LegalizerHelper::LegalizeResult
2111 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
2112                                     LLT WideTy) {
2113   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
2114   unsigned Offset = MI.getOperand(2).getImm();
2115 
2116   if (TypeIdx == 0) {
2117     if (SrcTy.isVector() || DstTy.isVector())
2118       return UnableToLegalize;
2119 
2120     SrcOp Src(SrcReg);
2121     if (SrcTy.isPointer()) {
2122       // Extracts from pointers can be handled only if they are really just
2123       // simple integers.
2124       const DataLayout &DL = MIRBuilder.getDataLayout();
2125       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
2126         return UnableToLegalize;
2127 
2128       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
2129       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
2130       SrcTy = SrcAsIntTy;
2131     }
2132 
2133     if (DstTy.isPointer())
2134       return UnableToLegalize;
2135 
2136     if (Offset == 0) {
2137       // Avoid a shift in the degenerate case.
2138       MIRBuilder.buildTrunc(DstReg,
2139                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
2140       MI.eraseFromParent();
2141       return Legalized;
2142     }
2143 
2144     // Do a shift in the source type.
2145     LLT ShiftTy = SrcTy;
2146     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2147       Src = MIRBuilder.buildAnyExt(WideTy, Src);
2148       ShiftTy = WideTy;
2149     }
2150 
2151     auto LShr = MIRBuilder.buildLShr(
2152       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
2153     MIRBuilder.buildTrunc(DstReg, LShr);
2154     MI.eraseFromParent();
2155     return Legalized;
2156   }
2157 
2158   if (SrcTy.isScalar()) {
2159     Observer.changingInstr(MI);
2160     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2161     Observer.changedInstr(MI);
2162     return Legalized;
2163   }
2164 
2165   if (!SrcTy.isVector())
2166     return UnableToLegalize;
2167 
2168   if (DstTy != SrcTy.getElementType())
2169     return UnableToLegalize;
2170 
2171   if (Offset % SrcTy.getScalarSizeInBits() != 0)
2172     return UnableToLegalize;
2173 
2174   Observer.changingInstr(MI);
2175   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2176 
2177   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
2178                           Offset);
2179   widenScalarDst(MI, WideTy.getScalarType(), 0);
2180   Observer.changedInstr(MI);
2181   return Legalized;
2182 }
2183 
2184 LegalizerHelper::LegalizeResult
2185 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2186                                    LLT WideTy) {
2187   if (TypeIdx != 0 || WideTy.isVector())
2188     return UnableToLegalize;
2189   Observer.changingInstr(MI);
2190   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2191   widenScalarDst(MI, WideTy);
2192   Observer.changedInstr(MI);
2193   return Legalized;
2194 }
2195 
2196 LegalizerHelper::LegalizeResult
2197 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
2198                                            LLT WideTy) {
2199   unsigned Opcode;
2200   unsigned ExtOpcode;
2201   std::optional<Register> CarryIn;
2202   switch (MI.getOpcode()) {
2203   default:
2204     llvm_unreachable("Unexpected opcode!");
2205   case TargetOpcode::G_SADDO:
2206     Opcode = TargetOpcode::G_ADD;
2207     ExtOpcode = TargetOpcode::G_SEXT;
2208     break;
2209   case TargetOpcode::G_SSUBO:
2210     Opcode = TargetOpcode::G_SUB;
2211     ExtOpcode = TargetOpcode::G_SEXT;
2212     break;
2213   case TargetOpcode::G_UADDO:
2214     Opcode = TargetOpcode::G_ADD;
2215     ExtOpcode = TargetOpcode::G_ZEXT;
2216     break;
2217   case TargetOpcode::G_USUBO:
2218     Opcode = TargetOpcode::G_SUB;
2219     ExtOpcode = TargetOpcode::G_ZEXT;
2220     break;
2221   case TargetOpcode::G_SADDE:
2222     Opcode = TargetOpcode::G_UADDE;
2223     ExtOpcode = TargetOpcode::G_SEXT;
2224     CarryIn = MI.getOperand(4).getReg();
2225     break;
2226   case TargetOpcode::G_SSUBE:
2227     Opcode = TargetOpcode::G_USUBE;
2228     ExtOpcode = TargetOpcode::G_SEXT;
2229     CarryIn = MI.getOperand(4).getReg();
2230     break;
2231   case TargetOpcode::G_UADDE:
2232     Opcode = TargetOpcode::G_UADDE;
2233     ExtOpcode = TargetOpcode::G_ZEXT;
2234     CarryIn = MI.getOperand(4).getReg();
2235     break;
2236   case TargetOpcode::G_USUBE:
2237     Opcode = TargetOpcode::G_USUBE;
2238     ExtOpcode = TargetOpcode::G_ZEXT;
2239     CarryIn = MI.getOperand(4).getReg();
2240     break;
2241   }
2242 
2243   if (TypeIdx == 1) {
2244     unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);
2245 
2246     Observer.changingInstr(MI);
2247     if (CarryIn)
2248       widenScalarSrc(MI, WideTy, 4, BoolExtOp);
2249     widenScalarDst(MI, WideTy, 1);
2250 
2251     Observer.changedInstr(MI);
2252     return Legalized;
2253   }
2254 
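       // e.g. %res:_(s8), %ovf:_(s1) = G_UADDO %a, %b widened to s32
       // (a sketch):
       //   %wa:_(s32) = G_ZEXT %a
       //   %wb:_(s32) = G_ZEXT %b
       //   %sum:_(s32) = G_ADD %wa, %wb
       //   %t:_(s8) = G_TRUNC %sum
       //   %z:_(s32) = G_ZEXT %t
       //   %ovf:_(s1) = G_ICMP intpred(ne), %sum, %z
       //   %res:_(s8) = G_TRUNC %sum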
2255   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
2256   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
2257   // Do the arithmetic in the larger type.
2258   Register NewOp;
2259   if (CarryIn) {
2260     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
2261     NewOp = MIRBuilder
2262                 .buildInstr(Opcode, {WideTy, CarryOutTy},
2263                             {LHSExt, RHSExt, *CarryIn})
2264                 .getReg(0);
2265   } else {
2266     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
2267   }
2268   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
2269   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
2270   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
2271   // There is no overflow if the ExtOp is the same as NewOp.
2272   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
2273   // Now trunc the NewOp to the original result.
2274   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
2275   MI.eraseFromParent();
2276   return Legalized;
2277 }
2278 
2279 LegalizerHelper::LegalizeResult
2280 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2281                                          LLT WideTy) {
2282   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2283                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2284                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2285   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2286                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
2287   // We can convert this to:
2288   //   1. Any extend iN to iM
2289   //   2. SHL by M-N
2290   //   3. [US][ADD|SUB|SHL]SAT
2291   //   4. L/ASHR by M-N
2292   //
2293   // It may be more efficient to lower this to a min and a max operation in
2294   // the higher precision arithmetic if the promoted operation isn't legal,
2295   // but this decision is up to the target's lowering request.
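       // e.g. G_SADDSAT on s8 widened to s32 (a sketch):
       //   %wa:_(s32) = G_ANYEXT %a
       //   %wb:_(s32) = G_ANYEXT %b
       //   %sa:_(s32) = G_SHL %wa, 24
       //   %sb:_(s32) = G_SHL %wb, 24
       //   %sat:_(s32) = G_SADDSAT %sa, %sb
       //   %shr:_(s32) = G_ASHR %sat, 24
       //   %res:_(s8) = G_TRUNC %shr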
2296   Register DstReg = MI.getOperand(0).getReg();
2297 
2298   unsigned NewBits = WideTy.getScalarSizeInBits();
2299   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
2300 
2301   // For shifts, the RHS is a shift amount: zero-extend it to preserve its
2302   // unsigned value, and do not left-shift it, so the amount stays correct.
2303   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
2304   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
2305                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
2306   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
2307   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
2308   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
2309 
2310   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
2311                                         {ShiftL, ShiftR}, MI.getFlags());
2312 
2313   // Use a shift that will preserve the number of sign bits when the trunc is
2314   // folded away.
2315   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
2316                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
2317 
2318   MIRBuilder.buildTrunc(DstReg, Result);
2319   MI.eraseFromParent();
2320   return Legalized;
2321 }
2322 
2323 LegalizerHelper::LegalizeResult
2324 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
2325                                  LLT WideTy) {
2326   if (TypeIdx == 1) {
2327     Observer.changingInstr(MI);
2328     widenScalarDst(MI, WideTy, 1);
2329     Observer.changedInstr(MI);
2330     return Legalized;
2331   }
2332 
2333   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
2334   auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
2335   LLT SrcTy = MRI.getType(LHS);
2336   LLT OverflowTy = MRI.getType(OriginalOverflow);
2337   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
2338 
2339   // To determine if the result overflowed in the larger type, we extend the
2340   // input to the larger type, do the multiply (checking if it overflows),
2341   // then also check the high bits of the result to see if overflow happened
2342   // there.
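       // e.g. %res:_(s8), %ovf:_(s1) = G_UMULO %a, %b widened to s32: the
       // product of two zero-extended s8 values cannot overflow s32, so only
       // the high-bits check remains (a sketch; buildZExtInReg below expands
       // to an AND with a mask):
       //   %wa:_(s32) = G_ZEXT %a
       //   %wb:_(s32) = G_ZEXT %b
       //   %mul:_(s32) = G_MUL %wa, %wb
       //   %res:_(s8) = G_TRUNC %mul
       //   %ext:_(s32) = G_AND %mul, 255
       //   %ovf:_(s1) = G_ICMP intpred(ne), %mul, %ext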
2343   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2344   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
2345   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
2346 
2347   // Multiplication cannot overflow if WideTy is >= 2 * the original width,
2348   // so we don't need to check the overflow result of the wide G_MULO.
2349   bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;
2350 
2351   unsigned MulOpc =
2352       WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;
2353 
2354   MachineInstrBuilder Mulo;
2355   if (WideMulCanOverflow)
2356     Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy, OverflowTy},
2357                                  {LeftOperand, RightOperand});
2358   else
2359     Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy}, {LeftOperand, RightOperand});
2360 
2361   auto Mul = Mulo->getOperand(0);
2362   MIRBuilder.buildTrunc(Result, Mul);
2363 
2364   MachineInstrBuilder ExtResult;
2365   // Overflow occurred if it occurred in the larger type, or if the high part
2366   // of the result does not zero/sign-extend the low part.  Check this second
2367   // possibility first.
2368   if (IsSigned) {
2369     // For signed, overflow occurred when the high part does not sign-extend
2370     // the low part.
2371     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
2372   } else {
2373     // Unsigned overflow occurred when the high part does not zero-extend the
2374     // low part.
2375     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
2376   }
2377 
2378   if (WideMulCanOverflow) {
2379     auto Overflow =
2380         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
2381     // Finally check if the multiplication in the larger type itself overflowed.
2382     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
2383   } else {
2384     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
2385   }
2386   MI.eraseFromParent();
2387   return Legalized;
2388 }
2389 
2390 LegalizerHelper::LegalizeResult
2391 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2392   switch (MI.getOpcode()) {
2393   default:
2394     return UnableToLegalize;
2395   case TargetOpcode::G_ATOMICRMW_XCHG:
2396   case TargetOpcode::G_ATOMICRMW_ADD:
2397   case TargetOpcode::G_ATOMICRMW_SUB:
2398   case TargetOpcode::G_ATOMICRMW_AND:
2399   case TargetOpcode::G_ATOMICRMW_OR:
2400   case TargetOpcode::G_ATOMICRMW_XOR:
2401   case TargetOpcode::G_ATOMICRMW_MIN:
2402   case TargetOpcode::G_ATOMICRMW_MAX:
2403   case TargetOpcode::G_ATOMICRMW_UMIN:
2404   case TargetOpcode::G_ATOMICRMW_UMAX:
2405     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2406     Observer.changingInstr(MI);
2407     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2408     widenScalarDst(MI, WideTy, 0);
2409     Observer.changedInstr(MI);
2410     return Legalized;
2411   case TargetOpcode::G_ATOMIC_CMPXCHG:
2412     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2413     Observer.changingInstr(MI);
2414     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2415     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2416     widenScalarDst(MI, WideTy, 0);
2417     Observer.changedInstr(MI);
2418     return Legalized;
2419   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2420     if (TypeIdx == 0) {
2421       Observer.changingInstr(MI);
2422       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2423       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2424       widenScalarDst(MI, WideTy, 0);
2425       Observer.changedInstr(MI);
2426       return Legalized;
2427     }
2428     assert(TypeIdx == 1 &&
2429            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2430     Observer.changingInstr(MI);
2431     widenScalarDst(MI, WideTy, 1);
2432     Observer.changedInstr(MI);
2433     return Legalized;
2434   case TargetOpcode::G_EXTRACT:
2435     return widenScalarExtract(MI, TypeIdx, WideTy);
2436   case TargetOpcode::G_INSERT:
2437     return widenScalarInsert(MI, TypeIdx, WideTy);
2438   case TargetOpcode::G_MERGE_VALUES:
2439     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2440   case TargetOpcode::G_UNMERGE_VALUES:
2441     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2442   case TargetOpcode::G_SADDO:
2443   case TargetOpcode::G_SSUBO:
2444   case TargetOpcode::G_UADDO:
2445   case TargetOpcode::G_USUBO:
2446   case TargetOpcode::G_SADDE:
2447   case TargetOpcode::G_SSUBE:
2448   case TargetOpcode::G_UADDE:
2449   case TargetOpcode::G_USUBE:
2450     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2451   case TargetOpcode::G_UMULO:
2452   case TargetOpcode::G_SMULO:
2453     return widenScalarMulo(MI, TypeIdx, WideTy);
2454   case TargetOpcode::G_SADDSAT:
2455   case TargetOpcode::G_SSUBSAT:
2456   case TargetOpcode::G_SSHLSAT:
2457   case TargetOpcode::G_UADDSAT:
2458   case TargetOpcode::G_USUBSAT:
2459   case TargetOpcode::G_USHLSAT:
2460     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2461   case TargetOpcode::G_CTTZ:
2462   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2463   case TargetOpcode::G_CTLZ:
2464   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2465   case TargetOpcode::G_CTPOP: {
2466     if (TypeIdx == 0) {
2467       Observer.changingInstr(MI);
2468       widenScalarDst(MI, WideTy, 0);
2469       Observer.changedInstr(MI);
2470       return Legalized;
2471     }
2472 
2473     Register SrcReg = MI.getOperand(1).getReg();
2474 
2475     // First extend the input.
2476     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2477                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2478                           ? TargetOpcode::G_ANYEXT
2479                           : TargetOpcode::G_ZEXT;
2480     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2481     LLT CurTy = MRI.getType(SrcReg);
2482     unsigned NewOpc = MI.getOpcode();
2483     if (NewOpc == TargetOpcode::G_CTTZ) {
2484       // The count is the same in the larger type except if the original
2485       // value was zero.  This can be handled by setting the bit just off
2486       // the top of the original type.
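           // e.g. for s8 widened to s32 this ORs in bit 8 (0x100), so an
           // all-zero source still yields cttz == 8, the narrow type's width.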
2487       auto TopBit =
2488           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2489       MIBSrc = MIRBuilder.buildOr(
2490         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2491       // Now we know the operand is non-zero, use the more relaxed opcode.
2492       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2493     }
2494 
2495     // Perform the operation at the larger size.
2496     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2497     // This is already the correct result for CTPOP and the CTTZ variants.
2498     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2499         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2500       // The correct result is NewOp - (WideTy size - CurTy size).
2501       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2502       MIBNewOp = MIRBuilder.buildSub(
2503           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2504     }
2505 
2506     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2507     MI.eraseFromParent();
2508     return Legalized;
2509   }
2510   case TargetOpcode::G_BSWAP: {
2511     Observer.changingInstr(MI);
2512     Register DstReg = MI.getOperand(0).getReg();
2513 
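         // The wide G_BSWAP leaves the interesting bytes in the high part, so
         // shift them back down, e.g. an s16 bswap widened to s32 (a sketch):
         //   %ext:_(s32) = G_ANYEXT %src:_(s16)
         //   %swap:_(s32) = G_BSWAP %ext
         //   %shr:_(s32) = G_LSHR %swap, 16
         //   %dst:_(s16) = G_TRUNC %shr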
2514     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2515     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2516     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2517     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2518 
2519     MI.getOperand(0).setReg(DstExt);
2520 
2521     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2522 
2523     LLT Ty = MRI.getType(DstReg);
2524     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2525     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2526     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2527 
2528     MIRBuilder.buildTrunc(DstReg, ShrReg);
2529     Observer.changedInstr(MI);
2530     return Legalized;
2531   }
2532   case TargetOpcode::G_BITREVERSE: {
2533     Observer.changingInstr(MI);
2534 
2535     Register DstReg = MI.getOperand(0).getReg();
2536     LLT Ty = MRI.getType(DstReg);
2537     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2538 
2539     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2540     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2541     MI.getOperand(0).setReg(DstExt);
2542     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2543 
2544     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2545     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2546     MIRBuilder.buildTrunc(DstReg, Shift);
2547     Observer.changedInstr(MI);
2548     return Legalized;
2549   }
2550   case TargetOpcode::G_FREEZE:
2551     Observer.changingInstr(MI);
2552     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2553     widenScalarDst(MI, WideTy);
2554     Observer.changedInstr(MI);
2555     return Legalized;
2556 
2557   case TargetOpcode::G_ABS:
2558     Observer.changingInstr(MI);
2559     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2560     widenScalarDst(MI, WideTy);
2561     Observer.changedInstr(MI);
2562     return Legalized;
2563 
2564   case TargetOpcode::G_ADD:
2565   case TargetOpcode::G_AND:
2566   case TargetOpcode::G_MUL:
2567   case TargetOpcode::G_OR:
2568   case TargetOpcode::G_XOR:
2569   case TargetOpcode::G_SUB:
2570     // Perform operation at larger width (any extension is fine here, high bits
2571     // don't affect the result) and then truncate the result back to the
2572     // original type.
2573     Observer.changingInstr(MI);
2574     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2575     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2576     widenScalarDst(MI, WideTy);
2577     Observer.changedInstr(MI);
2578     return Legalized;
2579 
2580   case TargetOpcode::G_SBFX:
2581   case TargetOpcode::G_UBFX:
2582     Observer.changingInstr(MI);
2583 
2584     if (TypeIdx == 0) {
2585       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2586       widenScalarDst(MI, WideTy);
2587     } else {
2588       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2589       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2590     }
2591 
2592     Observer.changedInstr(MI);
2593     return Legalized;
2594 
2595   case TargetOpcode::G_SHL:
2596     Observer.changingInstr(MI);
2597 
2598     if (TypeIdx == 0) {
2599       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2600       widenScalarDst(MI, WideTy);
2601     } else {
2602       assert(TypeIdx == 1);
2603       // The "number of bits to shift" operand must preserve its value as an
2604       // unsigned integer:
2605       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2606     }
2607 
2608     Observer.changedInstr(MI);
2609     return Legalized;
2610 
2611   case TargetOpcode::G_ROTR:
2612   case TargetOpcode::G_ROTL:
2613     if (TypeIdx != 1)
2614       return UnableToLegalize;
2615 
2616     Observer.changingInstr(MI);
2617     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2618     Observer.changedInstr(MI);
2619     return Legalized;
2620 
2621   case TargetOpcode::G_SDIV:
2622   case TargetOpcode::G_SREM:
2623   case TargetOpcode::G_SMIN:
2624   case TargetOpcode::G_SMAX:
2625     Observer.changingInstr(MI);
2626     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2627     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2628     widenScalarDst(MI, WideTy);
2629     Observer.changedInstr(MI);
2630     return Legalized;
2631 
2632   case TargetOpcode::G_SDIVREM:
2633     Observer.changingInstr(MI);
2634     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2635     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2636     widenScalarDst(MI, WideTy);
2637     widenScalarDst(MI, WideTy, 1);
2638     Observer.changedInstr(MI);
2639     return Legalized;
2640 
2641   case TargetOpcode::G_ASHR:
2642   case TargetOpcode::G_LSHR:
2643     Observer.changingInstr(MI);
2644 
2645     if (TypeIdx == 0) {
2646       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2647         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2648 
2649       widenScalarSrc(MI, WideTy, 1, CvtOp);
2650       widenScalarDst(MI, WideTy);
2651     } else {
2652       assert(TypeIdx == 1);
2653       // The "number of bits to shift" operand must preserve its value as an
2654       // unsigned integer:
2655       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2656     }
2657 
2658     Observer.changedInstr(MI);
2659     return Legalized;
2660   case TargetOpcode::G_UDIV:
2661   case TargetOpcode::G_UREM:
2662   case TargetOpcode::G_UMIN:
2663   case TargetOpcode::G_UMAX:
2664     Observer.changingInstr(MI);
2665     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2666     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2667     widenScalarDst(MI, WideTy);
2668     Observer.changedInstr(MI);
2669     return Legalized;
2670 
2671   case TargetOpcode::G_UDIVREM:
2672     Observer.changingInstr(MI);
2673     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2674     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2675     widenScalarDst(MI, WideTy);
2676     widenScalarDst(MI, WideTy, 1);
2677     Observer.changedInstr(MI);
2678     return Legalized;
2679 
2680   case TargetOpcode::G_SELECT:
2681     Observer.changingInstr(MI);
2682     if (TypeIdx == 0) {
2683       // Perform operation at larger width (any extension is fine here, high
2684       // bits don't affect the result) and then truncate the result back to the
2685       // original type.
2686       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2687       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2688       widenScalarDst(MI, WideTy);
2689     } else {
2690       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2691       // Explicit extension is required here since high bits affect the result.
2692       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2693     }
2694     Observer.changedInstr(MI);
2695     return Legalized;
2696 
2697   case TargetOpcode::G_FPTOSI:
2698   case TargetOpcode::G_FPTOUI:
2699   case TargetOpcode::G_IS_FPCLASS:
2700     Observer.changingInstr(MI);
2701 
2702     if (TypeIdx == 0)
2703       widenScalarDst(MI, WideTy);
2704     else
2705       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2706 
2707     Observer.changedInstr(MI);
2708     return Legalized;
2709   case TargetOpcode::G_SITOFP:
2710     Observer.changingInstr(MI);
2711 
2712     if (TypeIdx == 0)
2713       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2714     else
2715       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2716 
2717     Observer.changedInstr(MI);
2718     return Legalized;
2719   case TargetOpcode::G_UITOFP:
2720     Observer.changingInstr(MI);
2721 
2722     if (TypeIdx == 0)
2723       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2724     else
2725       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2726 
2727     Observer.changedInstr(MI);
2728     return Legalized;
2729   case TargetOpcode::G_LOAD:
2730   case TargetOpcode::G_SEXTLOAD:
2731   case TargetOpcode::G_ZEXTLOAD:
2732     Observer.changingInstr(MI);
2733     widenScalarDst(MI, WideTy);
2734     Observer.changedInstr(MI);
2735     return Legalized;
2736 
2737   case TargetOpcode::G_STORE: {
2738     if (TypeIdx != 0)
2739       return UnableToLegalize;
2740 
2741     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2742     if (!Ty.isScalar())
2743       return UnableToLegalize;
2744 
2745     Observer.changingInstr(MI);
2746 
2747     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2748       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2749     widenScalarSrc(MI, WideTy, 0, ExtType);
2750 
2751     Observer.changedInstr(MI);
2752     return Legalized;
2753   }
2754   case TargetOpcode::G_CONSTANT: {
2755     MachineOperand &SrcMO = MI.getOperand(1);
2756     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2757     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2758         MRI.getType(MI.getOperand(0).getReg()));
2759     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2760             ExtOpc == TargetOpcode::G_ANYEXT) &&
2761            "Illegal Extend");
2762     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2763     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2764                            ? SrcVal.sext(WideTy.getSizeInBits())
2765                            : SrcVal.zext(WideTy.getSizeInBits());
2766     Observer.changingInstr(MI);
2767     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2768 
2769     widenScalarDst(MI, WideTy);
2770     Observer.changedInstr(MI);
2771     return Legalized;
2772   }
2773   case TargetOpcode::G_FCONSTANT: {
2774     // To avoid changing the bits of the constant due to extension to a larger
2775     // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
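    // For example (illustrative): an s32 G_FCONSTANT float 1.0 widened to s64
    // becomes:
    //   %c:_(s64) = G_CONSTANT i64 0x3F800000
    //   %f:_(s32) = G_TRUNC %c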
2776     MachineOperand &SrcMO = MI.getOperand(1);
2777     APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
2778     MIRBuilder.setInstrAndDebugLoc(MI);
2779     auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
2780     widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
2781     MI.eraseFromParent();
2782     return Legalized;
2783   }
2784   case TargetOpcode::G_IMPLICIT_DEF: {
2785     Observer.changingInstr(MI);
2786     widenScalarDst(MI, WideTy);
2787     Observer.changedInstr(MI);
2788     return Legalized;
2789   }
2790   case TargetOpcode::G_BRCOND:
2791     Observer.changingInstr(MI);
2792     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2793     Observer.changedInstr(MI);
2794     return Legalized;
2795 
2796   case TargetOpcode::G_FCMP:
2797     Observer.changingInstr(MI);
2798     if (TypeIdx == 0)
2799       widenScalarDst(MI, WideTy);
2800     else {
2801       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2802       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2803     }
2804     Observer.changedInstr(MI);
2805     return Legalized;
2806 
2807   case TargetOpcode::G_ICMP:
2808     Observer.changingInstr(MI);
2809     if (TypeIdx == 0)
2810       widenScalarDst(MI, WideTy);
2811     else {
2812       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2813                                MI.getOperand(1).getPredicate()))
2814                                ? TargetOpcode::G_SEXT
2815                                : TargetOpcode::G_ZEXT;
2816       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2817       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2818     }
2819     Observer.changedInstr(MI);
2820     return Legalized;
2821 
2822   case TargetOpcode::G_PTR_ADD:
2823     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2824     Observer.changingInstr(MI);
2825     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2826     Observer.changedInstr(MI);
2827     return Legalized;
2828 
2829   case TargetOpcode::G_PHI: {
2830     assert(TypeIdx == 0 && "Expecting only Idx 0");
2831 
2832     Observer.changingInstr(MI);
2833     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2834       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2835       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
2836       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2837     }
2838 
2839     MachineBasicBlock &MBB = *MI.getParent();
2840     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2841     widenScalarDst(MI, WideTy);
2842     Observer.changedInstr(MI);
2843     return Legalized;
2844   }
2845   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2846     if (TypeIdx == 0) {
2847       Register VecReg = MI.getOperand(1).getReg();
2848       LLT VecTy = MRI.getType(VecReg);
2849       Observer.changingInstr(MI);
2850 
2851       widenScalarSrc(
2852           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2853           TargetOpcode::G_ANYEXT);
2854 
2855       widenScalarDst(MI, WideTy, 0);
2856       Observer.changedInstr(MI);
2857       return Legalized;
2858     }
2859 
2860     if (TypeIdx != 2)
2861       return UnableToLegalize;
2862     Observer.changingInstr(MI);
2863     // TODO: Probably should be zext
2864     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2865     Observer.changedInstr(MI);
2866     return Legalized;
2867   }
2868   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2869     if (TypeIdx == 0) {
2870       Observer.changingInstr(MI);
2871       const LLT WideEltTy = WideTy.getElementType();
2872 
2873       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2874       widenScalarSrc(MI, WideEltTy, 2, TargetOpcode::G_ANYEXT);
2875       widenScalarDst(MI, WideTy, 0);
2876       Observer.changedInstr(MI);
2877       return Legalized;
2878     }
2879 
2880     if (TypeIdx == 1) {
2881       Observer.changingInstr(MI);
2882 
2883       Register VecReg = MI.getOperand(1).getReg();
2884       LLT VecTy = MRI.getType(VecReg);
2885       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2886 
2887       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2888       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2889       widenScalarDst(MI, WideVecTy, 0);
2890       Observer.changedInstr(MI);
2891       return Legalized;
2892     }
2893 
2894     if (TypeIdx == 2) {
2895       Observer.changingInstr(MI);
2896       // TODO: Probably should be zext
2897       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2898       Observer.changedInstr(MI);
2899       return Legalized;
2900     }
2901 
2902     return UnableToLegalize;
2903   }
2904   case TargetOpcode::G_FADD:
2905   case TargetOpcode::G_FMUL:
2906   case TargetOpcode::G_FSUB:
2907   case TargetOpcode::G_FMA:
2908   case TargetOpcode::G_FMAD:
2909   case TargetOpcode::G_FNEG:
2910   case TargetOpcode::G_FABS:
2911   case TargetOpcode::G_FCANONICALIZE:
2912   case TargetOpcode::G_FMINNUM:
2913   case TargetOpcode::G_FMAXNUM:
2914   case TargetOpcode::G_FMINNUM_IEEE:
2915   case TargetOpcode::G_FMAXNUM_IEEE:
2916   case TargetOpcode::G_FMINIMUM:
2917   case TargetOpcode::G_FMAXIMUM:
2918   case TargetOpcode::G_FDIV:
2919   case TargetOpcode::G_FREM:
2920   case TargetOpcode::G_FCEIL:
2921   case TargetOpcode::G_FFLOOR:
2922   case TargetOpcode::G_FCOS:
2923   case TargetOpcode::G_FSIN:
2924   case TargetOpcode::G_FLOG10:
2925   case TargetOpcode::G_FLOG:
2926   case TargetOpcode::G_FLOG2:
2927   case TargetOpcode::G_FRINT:
2928   case TargetOpcode::G_FNEARBYINT:
2929   case TargetOpcode::G_FSQRT:
2930   case TargetOpcode::G_FEXP:
2931   case TargetOpcode::G_FEXP2:
2932   case TargetOpcode::G_FEXP10:
2933   case TargetOpcode::G_FPOW:
2934   case TargetOpcode::G_INTRINSIC_TRUNC:
2935   case TargetOpcode::G_INTRINSIC_ROUND:
2936   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2937     assert(TypeIdx == 0);
2938     Observer.changingInstr(MI);
2939 
2940     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2941       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2942 
2943     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2944     Observer.changedInstr(MI);
2945     return Legalized;
2946   case TargetOpcode::G_FPOWI:
2947   case TargetOpcode::G_FLDEXP:
2948   case TargetOpcode::G_STRICT_FLDEXP: {
2949     if (TypeIdx == 0) {
2950       if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP)
2951         return UnableToLegalize;
2952 
2953       Observer.changingInstr(MI);
2954       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2955       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2956       Observer.changedInstr(MI);
2957       return Legalized;
2958     }
2959 
2960     if (TypeIdx == 1) {
2961       // For some reason SelectionDAG tries to promote to a libcall without
2962       // actually changing the integer type for promotion.
2963       Observer.changingInstr(MI);
2964       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2965       Observer.changedInstr(MI);
2966       return Legalized;
2967     }
2968 
2969     return UnableToLegalize;
2970   }
2971   case TargetOpcode::G_FFREXP: {
2972     Observer.changingInstr(MI);
2973 
2974     if (TypeIdx == 0) {
2975       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2976       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2977     } else {
2978       widenScalarDst(MI, WideTy, 1);
2979     }
2980 
2981     Observer.changedInstr(MI);
2982     return Legalized;
2983   }
2984   case TargetOpcode::G_INTTOPTR:
2985     if (TypeIdx != 1)
2986       return UnableToLegalize;
2987 
2988     Observer.changingInstr(MI);
2989     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2990     Observer.changedInstr(MI);
2991     return Legalized;
2992   case TargetOpcode::G_PTRTOINT:
2993     if (TypeIdx != 0)
2994       return UnableToLegalize;
2995 
2996     Observer.changingInstr(MI);
2997     widenScalarDst(MI, WideTy, 0);
2998     Observer.changedInstr(MI);
2999     return Legalized;
3000   case TargetOpcode::G_BUILD_VECTOR: {
3001     Observer.changingInstr(MI);
3002 
3003     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
3004     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
3005       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
3006 
3007     // Avoid changing the result vector type if the source element type was
3008     // requested.
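    // e.g. (illustrative) a <4 x s8> G_BUILD_VECTOR whose s8 sources are
    // widened to s32 becomes a G_BUILD_VECTOR_TRUNC of the s32 sources,
    // leaving the <4 x s8> result type unchanged.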
3009     if (TypeIdx == 1) {
3010       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
3011     } else {
3012       widenScalarDst(MI, WideTy, 0);
3013     }
3014 
3015     Observer.changedInstr(MI);
3016     return Legalized;
3017   }
3018   case TargetOpcode::G_SEXT_INREG:
3019     if (TypeIdx != 0)
3020       return UnableToLegalize;
3021 
3022     Observer.changingInstr(MI);
3023     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
3024     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
3025     Observer.changedInstr(MI);
3026     return Legalized;
3027   case TargetOpcode::G_PTRMASK: {
3028     if (TypeIdx != 1)
3029       return UnableToLegalize;
3030     Observer.changingInstr(MI);
3031     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
3032     Observer.changedInstr(MI);
3033     return Legalized;
3034   }
3035   case TargetOpcode::G_VECREDUCE_FADD:
3036   case TargetOpcode::G_VECREDUCE_FMUL:
3037   case TargetOpcode::G_VECREDUCE_FMIN:
3038   case TargetOpcode::G_VECREDUCE_FMAX:
3039   case TargetOpcode::G_VECREDUCE_FMINIMUM:
3040   case TargetOpcode::G_VECREDUCE_FMAXIMUM:
3041     if (TypeIdx != 0)
3042       return UnableToLegalize;
3043     Observer.changingInstr(MI);
3044     Register VecReg = MI.getOperand(1).getReg();
3045     LLT VecTy = MRI.getType(VecReg);
3046     LLT WideVecTy = VecTy.isVector()
3047                         ? LLT::vector(VecTy.getElementCount(), WideTy)
3048                         : WideTy;
3049     widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_FPEXT);
3050     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3051     Observer.changedInstr(MI);
3052     return Legalized;
3053   }
3054 }
3055 
3056 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
3057                              MachineIRBuilder &B, Register Src, LLT Ty) {
3058   auto Unmerge = B.buildUnmerge(Ty, Src);
3059   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3060     Pieces.push_back(Unmerge.getReg(I));
3061 }
3062 
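// Lower a G_FCONSTANT by placing the value in the constant pool and emitting a
// load from it, a common fallback when FP immediates are not legal.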
3063 LegalizerHelper::LegalizeResult
3064 LegalizerHelper::lowerFConstant(MachineInstr &MI) {
3065   Register Dst = MI.getOperand(0).getReg();
3066 
3067   MachineFunction &MF = MIRBuilder.getMF();
3068   const DataLayout &DL = MIRBuilder.getDataLayout();
3069 
3070   unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
3071   LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3072   Align Alignment = Align(DL.getABITypeAlign(
3073       getFloatTypeForLLT(MF.getFunction().getContext(), MRI.getType(Dst))));
3074 
3075   auto Addr = MIRBuilder.buildConstantPool(
3076       AddrPtrTy, MF.getConstantPool()->getConstantPoolIndex(
3077                      MI.getOperand(1).getFPImm(), Alignment));
3078 
3079   MachineMemOperand *MMO = MF.getMachineMemOperand(
3080       MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
3081       MRI.getType(Dst), Alignment);
3082 
3083   MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Addr, *MMO);
3084   MI.eraseFromParent();
3085 
3086   return Legalized;
3087 }
3088 
3089 LegalizerHelper::LegalizeResult
3090 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
3091   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
3092   if (SrcTy.isVector()) {
3093     LLT SrcEltTy = SrcTy.getElementType();
3094     SmallVector<Register, 8> SrcRegs;
3095 
3096     if (DstTy.isVector()) {
3097       int NumDstElt = DstTy.getNumElements();
3098       int NumSrcElt = SrcTy.getNumElements();
3099 
3100       LLT DstEltTy = DstTy.getElementType();
3101       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
3102       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
3103 
3104       // If there's an element size mismatch, insert intermediate casts to match
3105       // the result element type.
3106       if (NumSrcElt < NumDstElt) { // Source element type is larger.
3107         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
3108         //
3109         // =>
3110         //
3111         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
3112         // %4:_(<2 x s8>) = G_BITCAST %2
3113         // %5:_(<2 x s8>) = G_BITCAST %3
3114         // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
3115         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
3116         SrcPartTy = SrcEltTy;
3117       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
3118         //
3119         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
3120         //
3121         // =>
3122         //
3123         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
3124         // %4:_(s16) = G_BITCAST %2
3125         // %5:_(s16) = G_BITCAST %3
3126         // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
3127         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
3128         DstCastTy = DstEltTy;
3129       }
3130 
3131       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
3132       for (Register &SrcReg : SrcRegs)
3133         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
3134     } else
3135       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
3136 
3137     MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
3138     MI.eraseFromParent();
3139     return Legalized;
3140   }
3141 
3142   if (DstTy.isVector()) {
3143     SmallVector<Register, 8> SrcRegs;
3144     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
3145     MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
3146     MI.eraseFromParent();
3147     return Legalized;
3148   }
3149 
3150   return UnableToLegalize;
3151 }
3152 
3153 /// Figure out the bit offset into a register when coercing a vector index for
3154 /// the wide element type. This is only for the case when promoting a vector
3155 /// to one with larger elements.
3156 ///
3158 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3159 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
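///
/// For example (illustrative): with SrcEltSize = 8 and DstEltSize = 32 the
/// ratio is 4, so %offset_idx = %idx & 3 and %offset_bits = %offset_idx << 3,
/// i.e. the bit position of the s8 element within its containing s32.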
3160 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3161                                                    Register Idx,
3162                                                    unsigned NewEltSize,
3163                                                    unsigned OldEltSize) {
3164   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3165   LLT IdxTy = B.getMRI()->getType(Idx);
3166 
3167   // Now figure out the amount we need to shift to get the target bits.
3168   auto OffsetMask = B.buildConstant(
3169       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
3170   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
3171   return B.buildShl(IdxTy, OffsetIdx,
3172                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
3173 }
3174 
3175 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3176 /// is casting to a vector with a smaller element size, perform multiple element
3177 /// extracts and merge the results. If this is coercing to a vector with larger
3178 /// elements, index the bitcasted vector and extract the target element with bit
3179 /// operations. This is intended to force the indexing in the native register
3180 /// size for architectures that can dynamically index the register file.
3181 LegalizerHelper::LegalizeResult
3182 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
3183                                          LLT CastTy) {
3184   if (TypeIdx != 1)
3185     return UnableToLegalize;
3186 
3187   auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();
3188 
3189   LLT SrcEltTy = SrcVecTy.getElementType();
3190   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3191   unsigned OldNumElts = SrcVecTy.getNumElements();
3192 
3193   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3194   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
3195 
3196   const unsigned NewEltSize = NewEltTy.getSizeInBits();
3197   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
3198   if (NewNumElts > OldNumElts) {
3199     // Decreasing the vector element size
3200     //
3201     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
3202     //  =>
3203     //  v4i32:castx = bitcast x:v2i64
3204     //
3205     // i64 = bitcast
3206     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
3207     //                       (i32 (extract_vector_elt castx, (2 * y + 1))))
3208     //
3209     if (NewNumElts % OldNumElts != 0)
3210       return UnableToLegalize;
3211 
3212     // Type of the intermediate result vector.
3213     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
3214     LLT MidTy =
3215         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
3216 
3217     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
3218 
3219     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
3220     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
3221 
3222     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
3223       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
3224       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
3225       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
3226       NewOps[I] = Elt.getReg(0);
3227     }
3228 
3229     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
3230     MIRBuilder.buildBitcast(Dst, NewVec);
3231     MI.eraseFromParent();
3232     return Legalized;
3233   }
3234 
3235   if (NewNumElts < OldNumElts) {
3236     if (NewEltSize % OldEltSize != 0)
3237       return UnableToLegalize;
3238 
3239     // This only depends on powers of 2 because we use bit tricks to figure out
3240     // the bit offset we need to shift to get the target element. A general
3241     // expansion could emit division/multiply.
3242     if (!isPowerOf2_32(NewEltSize / OldEltSize))
3243       return UnableToLegalize;
3244 
3245     // Increasing the vector element size.
3246     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
3247     //
3248     //   =>
3249     //
3250     // %cast = G_BITCAST %vec
3251     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
3252     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
3253     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3254     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3255     // %elt_bits = G_LSHR %wide_elt, %offset_bits
3256     // %elt = G_TRUNC %elt_bits
3257 
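    // Concretely (illustrative): extracting an s8 element %idx from <8 x s8>
    // via <2 x s32>: %scaled_idx = %idx >> 2 selects the containing s32, and
    // %offset_bits = (%idx & 3) << 3 is the byte's bit position within it.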
3258     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3259     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
3260 
3261     // Divide to get the index in the wider element type.
3262     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
3263 
3264     Register WideElt = CastVec;
3265     if (CastTy.isVector()) {
3266       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
3267                                                      ScaledIdx).getReg(0);
3268     }
3269 
3270     // Compute the bit offset into the register of the target element.
3271     Register OffsetBits = getBitcastWiderVectorElementOffset(
3272       MIRBuilder, Idx, NewEltSize, OldEltSize);
3273 
3274     // Shift the wide element to get the target element.
3275     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
3276     MIRBuilder.buildTrunc(Dst, ExtractedBits);
3277     MI.eraseFromParent();
3278     return Legalized;
3279   }
3280 
3281   return UnableToLegalize;
3282 }
3283 
3284 /// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits,
3285 /// while preserving the other bits in \p TargetReg:
3286 ///
3287 /// (TargetReg & ~(lowBitsSet(InsertReg.size()) << OffsetBits)) | (zext(InsertReg) << OffsetBits)
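///
/// For example (illustrative): inserting an s8 value at bit offset 8 of an
/// s32 target computes (Target & ~0x0000FF00) | (zext(Insert) << 8).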
3288 static Register buildBitFieldInsert(MachineIRBuilder &B,
3289                                     Register TargetReg, Register InsertReg,
3290                                     Register OffsetBits) {
3291   LLT TargetTy = B.getMRI()->getType(TargetReg);
3292   LLT InsertTy = B.getMRI()->getType(InsertReg);
3293   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
3294   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
3295 
3296   // Produce a bitmask of the value to insert
3297   auto EltMask = B.buildConstant(
3298     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
3299                                    InsertTy.getSizeInBits()));
3300   // Shift it into position
3301   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
3302   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
3303 
3304   // Clear out the bits in the wide element
3305   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
3306 
3307   // The shifted insert value already has zeros everywhere else, so OR it
3308   // into the masked wide element.
3309   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
3310 }
3311 
3312 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3313 /// is increasing the element size, perform the indexing in the target element
3314 /// type, and use bit operations to insert at the element position. This is
3315 /// intended for architectures that can dynamically index the register file and
3316 /// want to force indexing in the native register size.
3317 LegalizerHelper::LegalizeResult
3318 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
3319                                         LLT CastTy) {
3320   if (TypeIdx != 0)
3321     return UnableToLegalize;
3322 
3323   auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
3324       MI.getFirst4RegLLTs();
3325   LLT VecTy = DstTy;
3326 
3327   LLT VecEltTy = VecTy.getElementType();
3328   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3329   const unsigned NewEltSize = NewEltTy.getSizeInBits();
3330   const unsigned OldEltSize = VecEltTy.getSizeInBits();
3331 
3332   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3333   unsigned OldNumElts = VecTy.getNumElements();
3334 
3335   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
3336   if (NewNumElts < OldNumElts) {
3337     if (NewEltSize % OldEltSize != 0)
3338       return UnableToLegalize;
3339 
3340     // This only depends on powers of 2 because we use bit tricks to figure out
3341     // the bit offset we need to shift to get the target element. A general
3342     // expansion could emit division/multiply.
3343     if (!isPowerOf2_32(NewEltSize / OldEltSize))
3344       return UnableToLegalize;
3345 
3346     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3347     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
3348 
3349     // Divide to get the index in the wider element type.
3350     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
3351 
3352     Register ExtractedElt = CastVec;
3353     if (CastTy.isVector()) {
3354       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
3355                                                           ScaledIdx).getReg(0);
3356     }
3357 
3358     // Compute the bit offset into the register of the target element.
3359     Register OffsetBits = getBitcastWiderVectorElementOffset(
3360       MIRBuilder, Idx, NewEltSize, OldEltSize);
3361 
3362     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
3363                                                Val, OffsetBits);
3364     if (CastTy.isVector()) {
3365       InsertedElt = MIRBuilder.buildInsertVectorElement(
3366         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
3367     }
3368 
3369     MIRBuilder.buildBitcast(Dst, InsertedElt);
3370     MI.eraseFromParent();
3371     return Legalized;
3372   }
3373 
3374   return UnableToLegalize;
3375 }
3376 
3377 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
3378   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
3379   Register DstReg = LoadMI.getDstReg();
3380   Register PtrReg = LoadMI.getPointerReg();
3381   LLT DstTy = MRI.getType(DstReg);
3382   MachineMemOperand &MMO = LoadMI.getMMO();
3383   LLT MemTy = MMO.getMemoryType();
3384   MachineFunction &MF = MIRBuilder.getMF();
3385 
3386   unsigned MemSizeInBits = MemTy.getSizeInBits();
3387   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
3388 
3389   if (MemSizeInBits != MemStoreSizeInBits) {
3390     if (MemTy.isVector())
3391       return UnableToLegalize;
3392 
3393     // Promote to a byte-sized load if not loading an integral number of
3394     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
3395     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
3396     MachineMemOperand *NewMMO =
3397         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
3398 
3399     Register LoadReg = DstReg;
3400     LLT LoadTy = DstTy;
3401 
3402     // If this wasn't already an extending load, we need to widen the result
3403     // register to avoid creating a load with a narrower result than the source.
3404     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
3405       LoadTy = WideMemTy;
3406       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
3407     }
3408 
3409     if (isa<GSExtLoad>(LoadMI)) {
3410       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
3411       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
3412     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
3413       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
3414       // The extra bits are guaranteed to be zero, since we stored them that
3415       // way.  A zext load from Wide thus automatically gives zext from MemVT.
3416       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
3417     } else {
3418       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
3419     }
3420 
3421     if (DstTy != LoadTy)
3422       MIRBuilder.buildTrunc(DstReg, LoadReg);
3423 
3424     LoadMI.eraseFromParent();
3425     return Legalized;
3426   }
3427 
3428   // Big endian lowering not implemented.
3429   if (MIRBuilder.getDataLayout().isBigEndian())
3430     return UnableToLegalize;
3431 
3432   // This load needs splitting into power of 2 sized loads.
3433   //
3434   // Our strategy here is to generate anyextending loads for the smaller
3435   // types up to next power-2 result type, and then combine the two larger
3436   // result values together, before truncating back down to the non-pow-2
3437   // type.
3438   // E.g. v1 = i24 load =>
3439   // v2 = i32 zextload (2 byte)
3440   // v3 = i32 load (1 byte)
3441   // v4 = i32 shl v3, 16
3442   // v5 = i32 or v4, v2
3443   // v1 = i24 trunc v5
3444   // By doing this we generate the correct truncate which should get
3445   // combined away as an artifact with a matching extend.
3446 
3447   uint64_t LargeSplitSize, SmallSplitSize;
3448 
3449   if (!isPowerOf2_32(MemSizeInBits)) {
3450     // This load needs splitting into power of 2 sized loads.
3451     LargeSplitSize = llvm::bit_floor(MemSizeInBits);
3452     SmallSplitSize = MemSizeInBits - LargeSplitSize;
3453   } else {
3454     // This is already a power of 2, but we still need to split this in half.
3455     //
3456     // Assume we're being asked to decompose an unaligned load.
3457     // TODO: If this requires multiple splits, handle them all at once.
3458     auto &Ctx = MF.getFunction().getContext();
3459     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3460       return UnableToLegalize;
3461 
3462     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3463   }
3464 
3465   if (MemTy.isVector()) {
3466     // TODO: Handle vector extloads
3467     if (MemTy != DstTy)
3468       return UnableToLegalize;
3469 
3470     // TODO: We can do better than scalarizing the vector and at least split it
3471     // in half.
3472     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
3473   }
3474 
3475   MachineMemOperand *LargeMMO =
3476       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3477   MachineMemOperand *SmallMMO =
3478       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3479 
3480   LLT PtrTy = MRI.getType(PtrReg);
3481   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
3482   LLT AnyExtTy = LLT::scalar(AnyExtSize);
3483   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
3484                                              PtrReg, *LargeMMO);
3485 
3486   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
3487                                             LargeSplitSize / 8);
3488   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3489   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3490   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3491                                              SmallPtr, *SmallMMO);
3492 
3493   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3494   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3495 
3496   if (AnyExtTy == DstTy)
3497     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3498   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3499     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3500     MIRBuilder.buildTrunc(DstReg, {Or});
3501   } else {
3502     assert(DstTy.isPointer() && "expected pointer");
3503     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3504 
3505     // FIXME: We currently consider this to be illegal for non-integral address
3506     // spaces, but we still need a way to reinterpret the bits.
3507     MIRBuilder.buildIntToPtr(DstReg, Or);
3508   }
3509 
3510   LoadMI.eraseFromParent();
3511   return Legalized;
3512 }
3513 
3514 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3515   // Lower a non-power of 2 store into multiple pow-2 stores.
3516   // E.g. split an i24 store into an i16 store + i8 store.
3517   // We do this by first extending the stored value to the next largest power
3518   // of 2 type, and then using truncating stores to store the components.
3519   // As with G_LOAD, this generates an extend that can be artifact-combined
3520   // away instead of leaving behind extracts.
3521   Register SrcReg = StoreMI.getValueReg();
3522   Register PtrReg = StoreMI.getPointerReg();
3523   LLT SrcTy = MRI.getType(SrcReg);
3524   MachineFunction &MF = MIRBuilder.getMF();
3525   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3526   LLT MemTy = MMO.getMemoryType();
3527 
3528   unsigned StoreWidth = MemTy.getSizeInBits();
3529   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3530 
3531   if (StoreWidth != StoreSizeInBits) {
3532     if (SrcTy.isVector())
3533       return UnableToLegalize;
3534 
3535     // Promote to a byte-sized store with upper bits zero if not
3536     // storing an integral number of bytes.  For example, promote
3537     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3538     LLT WideTy = LLT::scalar(StoreSizeInBits);
3539 
3540     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3541       // Avoid creating a store with a narrower source than result.
3542       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3543       SrcTy = WideTy;
3544     }
3545 
3546     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3547 
3548     MachineMemOperand *NewMMO =
3549         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3550     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3551     StoreMI.eraseFromParent();
3552     return Legalized;
3553   }
3554 
3555   if (MemTy.isVector()) {
3556     // TODO: Handle vector trunc stores
3557     if (MemTy != SrcTy)
3558       return UnableToLegalize;
3559 
3560     // TODO: We can do better than scalarizing the vector and at least split it
3561     // in half.
3562     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3563   }
3564 
3565   unsigned MemSizeInBits = MemTy.getSizeInBits();
3566   uint64_t LargeSplitSize, SmallSplitSize;
3567 
3568   if (!isPowerOf2_32(MemSizeInBits)) {
3569     LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits());
3570     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3571   } else {
3572     auto &Ctx = MF.getFunction().getContext();
3573     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3574       return UnableToLegalize; // Don't know what we're being asked to do.
3575 
3576     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3577   }
3578 
3579   // Extend to the next pow-2. If this store was itself the result of lowering,
3580   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3581   // that's wider than the stored size.
3582   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3583   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3584 
3585   if (SrcTy.isPointer()) {
3586     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3587     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3588   }
3589 
3590   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3591 
3592   // Obtain the smaller value by shifting away the larger value.
3593   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3594   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3595 
3596   // Generate the PtrAdd and truncating stores.
3597   LLT PtrTy = MRI.getType(PtrReg);
3598   auto OffsetCst = MIRBuilder.buildConstant(
3599     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3600   auto SmallPtr =
3601     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3602 
3603   MachineMemOperand *LargeMMO =
3604     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3605   MachineMemOperand *SmallMMO =
3606     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3607   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3608   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3609   StoreMI.eraseFromParent();
3610   return Legalized;
3611 }
3612 
3613 LegalizerHelper::LegalizeResult
3614 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3615   switch (MI.getOpcode()) {
3616   case TargetOpcode::G_LOAD: {
3617     if (TypeIdx != 0)
3618       return UnableToLegalize;
3619     MachineMemOperand &MMO = **MI.memoperands_begin();
3620 
3621     // Not sure how to interpret a bitcast of an extending load.
3622     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3623       return UnableToLegalize;
3624 
3625     Observer.changingInstr(MI);
3626     bitcastDst(MI, CastTy, 0);
3627     MMO.setType(CastTy);
3628     Observer.changedInstr(MI);
3629     return Legalized;
3630   }
3631   case TargetOpcode::G_STORE: {
3632     if (TypeIdx != 0)
3633       return UnableToLegalize;
3634 
3635     MachineMemOperand &MMO = **MI.memoperands_begin();
3636 
3637     // Not sure how to interpret a bitcast of a truncating store.
3638     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3639       return UnableToLegalize;
3640 
3641     Observer.changingInstr(MI);
3642     bitcastSrc(MI, CastTy, 0);
3643     MMO.setType(CastTy);
3644     Observer.changedInstr(MI);
3645     return Legalized;
3646   }
3647   case TargetOpcode::G_SELECT: {
3648     if (TypeIdx != 0)
3649       return UnableToLegalize;
3650 
3651     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3652       LLVM_DEBUG(
3653           dbgs() << "bitcast action not implemented for vector select\n");
3654       return UnableToLegalize;
3655     }
3656 
3657     Observer.changingInstr(MI);
3658     bitcastSrc(MI, CastTy, 2);
3659     bitcastSrc(MI, CastTy, 3);
3660     bitcastDst(MI, CastTy, 0);
3661     Observer.changedInstr(MI);
3662     return Legalized;
3663   }
3664   case TargetOpcode::G_AND:
3665   case TargetOpcode::G_OR:
3666   case TargetOpcode::G_XOR: {
3667     Observer.changingInstr(MI);
3668     bitcastSrc(MI, CastTy, 1);
3669     bitcastSrc(MI, CastTy, 2);
3670     bitcastDst(MI, CastTy, 0);
3671     Observer.changedInstr(MI);
3672     return Legalized;
3673   }
3674   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3675     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3676   case TargetOpcode::G_INSERT_VECTOR_ELT:
3677     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3678   default:
3679     return UnableToLegalize;
3680   }
3681 }
3682 
3683 // Legalize an instruction by changing the opcode in place.
3684 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3685   Observer.changingInstr(MI);
3686   MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3687   Observer.changedInstr(MI);
3688 }
3689 
3690 LegalizerHelper::LegalizeResult
3691 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3692   using namespace TargetOpcode;
3693 
3694   switch (MI.getOpcode()) {
3695   default:
3696     return UnableToLegalize;
3697   case TargetOpcode::G_FCONSTANT:
3698     return lowerFConstant(MI);
3699   case TargetOpcode::G_BITCAST:
3700     return lowerBitcast(MI);
3701   case TargetOpcode::G_SREM:
3702   case TargetOpcode::G_UREM: {
3703     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
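    // Compute the remainder as x - (x / y) * y, reusing the matching division.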
3704     auto Quot =
3705         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3706                               {MI.getOperand(1), MI.getOperand(2)});
3707 
3708     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3709     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3710     MI.eraseFromParent();
3711     return Legalized;
3712   }
3713   case TargetOpcode::G_SADDO:
3714   case TargetOpcode::G_SSUBO:
3715     return lowerSADDO_SSUBO(MI);
3716   case TargetOpcode::G_UMULH:
3717   case TargetOpcode::G_SMULH:
3718     return lowerSMULH_UMULH(MI);
3719   case TargetOpcode::G_SMULO:
3720   case TargetOpcode::G_UMULO: {
3721     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3722     // result.
3723     auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
3724     LLT Ty = MRI.getType(Res);
3725 
3726     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3727                           ? TargetOpcode::G_SMULH
3728                           : TargetOpcode::G_UMULH;
3729 
3730     Observer.changingInstr(MI);
3731     const auto &TII = MIRBuilder.getTII();
3732     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3733     MI.removeOperand(1);
3734     Observer.changedInstr(MI);
3735 
3736     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3737     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3738 
3739     // Move insert point forward so we can use the Res register if needed.
3740     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3741 
3742     // For *signed* multiply, overflow is detected by checking:
3743     // (hi != (lo >> bitwidth-1))
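    // e.g. (illustrative) for s32: overflow iff %hi != G_ASHR(%res, 31), i.e.
    // the high half is not simply the sign-extension of the low half.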
3744     if (Opcode == TargetOpcode::G_SMULH) {
3745       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3746       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3747       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3748     } else {
3749       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3750     }
3751     return Legalized;
3752   }
3753   case TargetOpcode::G_FNEG: {
3754     auto [Res, SubByReg] = MI.getFirst2Regs();
3755     LLT Ty = MRI.getType(Res);
3756 
3757     // TODO: Handle vector types once we are able to
3758     // represent them.
3759     if (Ty.isVector())
3760       return UnableToLegalize;
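    // Flip only the sign bit: res = src ^ SignMask (for s32 the mask is
    // 0x80000000).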
3761     auto SignMask =
3762         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3763     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3764     MI.eraseFromParent();
3765     return Legalized;
3766   }
3767   case TargetOpcode::G_FSUB:
3768   case TargetOpcode::G_STRICT_FSUB: {
3769     auto [Res, LHS, RHS] = MI.getFirst3Regs();
3770     LLT Ty = MRI.getType(Res);
3771 
3772     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3773     auto Neg = MIRBuilder.buildFNeg(Ty, RHS);
3774 
3775     if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
3776       MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags());
3777     else
3778       MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3779 
3780     MI.eraseFromParent();
3781     return Legalized;
3782   }
3783   case TargetOpcode::G_FMAD:
3784     return lowerFMad(MI);
3785   case TargetOpcode::G_FFLOOR:
3786     return lowerFFloor(MI);
3787   case TargetOpcode::G_INTRINSIC_ROUND:
3788     return lowerIntrinsicRound(MI);
3789   case TargetOpcode::G_FRINT: {
3790     // Since round even is the assumed rounding mode for unconstrained FP
3791     // operations, rint and roundeven are the same operation.
3792     changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN);
3793     return Legalized;
3794   }
3795   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3796     auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
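    // Lower to a plain G_ATOMIC_CMPXCHG plus an ICMP_EQ that recomputes the
    // success flag by comparing the loaded value against the expected value.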
3797     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3798                                   **MI.memoperands_begin());
3799     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3800     MI.eraseFromParent();
3801     return Legalized;
3802   }
3803   case TargetOpcode::G_LOAD:
3804   case TargetOpcode::G_SEXTLOAD:
3805   case TargetOpcode::G_ZEXTLOAD:
3806     return lowerLoad(cast<GAnyLoad>(MI));
3807   case TargetOpcode::G_STORE:
3808     return lowerStore(cast<GStore>(MI));
3809   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3810   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3811   case TargetOpcode::G_CTLZ:
3812   case TargetOpcode::G_CTTZ:
3813   case TargetOpcode::G_CTPOP:
3814     return lowerBitCount(MI);
3815   case G_UADDO: {
3816     auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();
3817 
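    // An unsigned add wraps iff the result is (unsigned) smaller than either
    // addend, so a single ICMP_ULT against RHS detects the carry.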
3818     MIRBuilder.buildAdd(Res, LHS, RHS);
3819     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3820 
3821     MI.eraseFromParent();
3822     return Legalized;
3823   }
3824   case G_UADDE: {
3825     auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
3826     const LLT CondTy = MRI.getType(CarryOut);
3827     const LLT Ty = MRI.getType(Res);
3828 
3829     // Initial add of the two operands.
3830     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3831 
3832     // Initial check for carry.
3833     auto Carry = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);
3834 
3835     // Add the sum and the carry.
3836     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3837     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3838 
3839     // Second check for carry. We can only carry if the initial sum is all 1s
3840     // and the carry is set, resulting in a new sum of 0.
3841     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3842     auto ResEqZero = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, Res, Zero);
3843     auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn);
3844     MIRBuilder.buildOr(CarryOut, Carry, Carry2);
3845 
3846     MI.eraseFromParent();
3847     return Legalized;
3848   }
3849   case G_USUBO: {
3850     auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();
3851 
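    // An unsigned subtract borrows iff LHS is (unsigned) smaller than RHS.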
3852     MIRBuilder.buildSub(Res, LHS, RHS);
3853     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3854 
3855     MI.eraseFromParent();
3856     return Legalized;
3857   }
3858   case G_USUBE: {
3859     auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
3860     const LLT CondTy = MRI.getType(BorrowOut);
3861     const LLT Ty = MRI.getType(Res);
3862 
3863     // Initial subtract of the two operands.
3864     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3865 
3866     // Initial check for borrow.
3867     auto Borrow = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, CondTy, TmpRes, LHS);
3868 
3869     // Subtract the borrow from the first subtract.
3870     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3871     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3872 
3873     // Second check for borrow. We can only borrow if the initial difference is
3874     // 0 and the borrow is set, resulting in a new difference of all 1s.
3875     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3876     auto TmpResEqZero =
3877         MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, TmpRes, Zero);
3878     auto Borrow2 = MIRBuilder.buildAnd(CondTy, TmpResEqZero, BorrowIn);
3879     MIRBuilder.buildOr(BorrowOut, Borrow, Borrow2);
3880 
3881     MI.eraseFromParent();
3882     return Legalized;
3883   }
3884   case G_UITOFP:
3885     return lowerUITOFP(MI);
3886   case G_SITOFP:
3887     return lowerSITOFP(MI);
3888   case G_FPTOUI:
3889     return lowerFPTOUI(MI);
3890   case G_FPTOSI:
3891     return lowerFPTOSI(MI);
3892   case G_FPTRUNC:
3893     return lowerFPTRUNC(MI);
3894   case G_FPOWI:
3895     return lowerFPOWI(MI);
3896   case G_SMIN:
3897   case G_SMAX:
3898   case G_UMIN:
3899   case G_UMAX:
3900     return lowerMinMax(MI);
3901   case G_FCOPYSIGN:
3902     return lowerFCopySign(MI);
3903   case G_FMINNUM:
3904   case G_FMAXNUM:
3905     return lowerFMinNumMaxNum(MI);
3906   case G_MERGE_VALUES:
3907     return lowerMergeValues(MI);
3908   case G_UNMERGE_VALUES:
3909     return lowerUnmergeValues(MI);
3910   case TargetOpcode::G_SEXT_INREG: {
3911     assert(MI.getOperand(2).isImm() && "Expected immediate");
3912     int64_t SizeInBits = MI.getOperand(2).getImm();
3913 
3914     auto [DstReg, SrcReg] = MI.getFirst2Regs();
3915     LLT DstTy = MRI.getType(DstReg);
3916     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3917 
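    // Expand as a shift pair (illustrative): for an s32 G_SEXT_INREG %src, 8:
    //   %tmp:_(s32) = G_SHL %src, 24
    //   %dst:_(s32) = G_ASHR %tmp, 24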
3918     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3919     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3920     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3921     MI.eraseFromParent();
3922     return Legalized;
3923   }
3924   case G_EXTRACT_VECTOR_ELT:
3925   case G_INSERT_VECTOR_ELT:
3926     return lowerExtractInsertVectorElt(MI);
3927   case G_SHUFFLE_VECTOR:
3928     return lowerShuffleVector(MI);
3929   case G_DYN_STACKALLOC:
3930     return lowerDynStackAlloc(MI);
3931   case G_STACKSAVE:
3932     return lowerStackSave(MI);
3933   case G_STACKRESTORE:
3934     return lowerStackRestore(MI);
3935   case G_EXTRACT:
3936     return lowerExtract(MI);
3937   case G_INSERT:
3938     return lowerInsert(MI);
3939   case G_BSWAP:
3940     return lowerBswap(MI);
3941   case G_BITREVERSE:
3942     return lowerBitreverse(MI);
3943   case G_READ_REGISTER:
3944   case G_WRITE_REGISTER:
3945     return lowerReadWriteRegister(MI);
3946   case G_UADDSAT:
3947   case G_USUBSAT: {
3948     // Try to make a reasonable guess about which lowering strategy to use. The
3949     // target can override this with custom lowering and calling the
3950     // implementation functions.
3951     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3952     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3953       return lowerAddSubSatToMinMax(MI);
3954     return lowerAddSubSatToAddoSubo(MI);
3955   }
3956   case G_SADDSAT:
3957   case G_SSUBSAT: {
3958     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3959 
3960     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3961     // since it's a shorter expansion. However, we would need to figure out the
3962     // preferred boolean type for the carry out for the query.
3963     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3964       return lowerAddSubSatToMinMax(MI);
3965     return lowerAddSubSatToAddoSubo(MI);
3966   }
3967   case G_SSHLSAT:
3968   case G_USHLSAT:
3969     return lowerShlSat(MI);
3970   case G_ABS:
3971     return lowerAbsToAddXor(MI);
3972   case G_SELECT:
3973     return lowerSelect(MI);
3974   case G_IS_FPCLASS:
3975     return lowerISFPCLASS(MI);
3976   case G_SDIVREM:
3977   case G_UDIVREM:
3978     return lowerDIVREM(MI);
3979   case G_FSHL:
3980   case G_FSHR:
3981     return lowerFunnelShift(MI);
3982   case G_ROTL:
3983   case G_ROTR:
3984     return lowerRotate(MI);
3985   case G_MEMSET:
3986   case G_MEMCPY:
3987   case G_MEMMOVE:
3988     return lowerMemCpyFamily(MI);
3989   case G_MEMCPY_INLINE:
3990     return lowerMemcpyInline(MI);
3991   case G_ZEXT:
3992   case G_SEXT:
3993   case G_ANYEXT:
3994     return lowerEXT(MI);
3995   case G_TRUNC:
3996     return lowerTRUNC(MI);
3997   GISEL_VECREDUCE_CASES_NONSEQ
3998     return lowerVectorReduction(MI);
3999   case G_VAARG:
4000     return lowerVAArg(MI);
4001   }
4002 }
4003 
4004 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
4005                                                   Align MinAlign) const {
4006   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
4007   // datalayout for the preferred alignment. Also there should be a target hook
4008   // for this to allow targets to reduce the alignment and ignore the
4009   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
4010   // the type.
4011   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
4012 }
4013 
4014 MachineInstrBuilder
4015 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
4016                                       MachinePointerInfo &PtrInfo) {
4017   MachineFunction &MF = MIRBuilder.getMF();
4018   const DataLayout &DL = MIRBuilder.getDataLayout();
4019   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
4020 
4021   unsigned AddrSpace = DL.getAllocaAddrSpace();
4022   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
4023 
4024   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
4025   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
4026 }
4027 
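/// Clamp a dynamic vector index into the valid range [0, NElts - 1]: constant
/// indices are returned unchanged (assumed in bounds), a power-of-2 element
/// count uses a mask (e.g. NElts = 4 yields Idx & 3), and anything else uses
/// umin(Idx, NElts - 1).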
4028 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
4029                                         LLT VecTy) {
4030   int64_t IdxVal;
4031   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
4032     return IdxReg;
4033 
4034   LLT IdxTy = B.getMRI()->getType(IdxReg);
4035   unsigned NElts = VecTy.getNumElements();
4036   if (isPowerOf2_32(NElts)) {
4037     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
4038     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
4039   }
4040 
4041   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
4042       .getReg(0);
4043 }
4044 
4045 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
4046                                                   Register Index) {
4047   LLT EltTy = VecTy.getElementType();
4048 
4049   // Calculate the element offset and add it to the pointer.
4050   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
4051   assert(EltSize * 8 == EltTy.getSizeInBits() &&
4052          "Converting bits to bytes lost precision");
4053 
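  // For example (illustrative): for a <4 x s32> vector EltSize is 4 bytes, so
  // the element address is VecPtr + clamp(Index) * 4.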
4054   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
4055 
4056   LLT IdxTy = MRI.getType(Index);
4057   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
4058                                  MIRBuilder.buildConstant(IdxTy, EltSize));
4059 
4060   LLT PtrTy = MRI.getType(VecPtr);
4061   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
4062 }
4063 
4064 #ifndef NDEBUG
4065 /// Check that all vector operands have the same number of elements. Other
4066 /// operands should be listed in \p NonVecOpIndices.
4067 static bool hasSameNumEltsOnAllVectorOperands(
4068     GenericMachineInstr &MI, MachineRegisterInfo &MRI,
4069     std::initializer_list<unsigned> NonVecOpIndices) {
4070   if (MI.getNumMemOperands() != 0)
4071     return false;
4072 
4073   LLT VecTy = MRI.getType(MI.getReg(0));
4074   if (!VecTy.isVector())
4075     return false;
4076   unsigned NumElts = VecTy.getNumElements();
4077 
4078   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
4079     MachineOperand &Op = MI.getOperand(OpIdx);
4080     if (!Op.isReg()) {
4081       if (!is_contained(NonVecOpIndices, OpIdx))
4082         return false;
4083       continue;
4084     }
4085 
4086     LLT Ty = MRI.getType(Op.getReg());
4087     if (!Ty.isVector()) {
4088       if (!is_contained(NonVecOpIndices, OpIdx))
4089         return false;
4090       continue;
4091     }
4092 
4093     if (Ty.getNumElements() != NumElts)
4094       return false;
4095   }
4096 
4097   return true;
4098 }
4099 #endif
4100 
4101 /// Fill \p DstOps with DstOps that, combined, cover the same number of
4102 /// elements as \p Ty. Each DstOp is either a scalar when \p NumElts = 1 or a
4103 /// vector with \p NumElts elements. When Ty.getNumElements() is not a multiple
4104 /// of \p NumElts, the last DstOp (the leftover) has fewer than \p NumElts elements.
4105 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
4106                        unsigned NumElts) {
4107   LLT LeftoverTy;
4108   assert(Ty.isVector() && "Expected vector type");
4109   LLT EltTy = Ty.getElementType();
4110   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
4111   int NumParts, NumLeftover;
4112   std::tie(NumParts, NumLeftover) =
4113       getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
4114 
4115   assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
4116   for (int i = 0; i < NumParts; ++i) {
4117     DstOps.push_back(NarrowTy);
4118   }
4119 
4120   if (LeftoverTy.isValid()) {
4121     assert(NumLeftover == 1 && "expected exactly one leftover");
4122     DstOps.push_back(LeftoverTy);
4123   }
4124 }
4125 
4126 /// Operand \p Op is used by \p N sub-instructions. Fill \p Ops with \p N SrcOps
4127 /// made from \p Op depending on operand type.
4128 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
4129                            MachineOperand &Op) {
4130   for (unsigned i = 0; i < N; ++i) {
4131     if (Op.isReg())
4132       Ops.push_back(Op.getReg());
4133     else if (Op.isImm())
4134       Ops.push_back(Op.getImm());
4135     else if (Op.isPredicate())
4136       Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
4137     else
4138       llvm_unreachable("Unsupported type");
4139   }
4140 }
4141 
4142 // Handle splitting vector operations which need to have the same number of
4143 // elements in each type index, but each type index may have a different element
4144 // type.
4145 //
4146 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
4147 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4148 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4149 //
4150 // Also handles some irregular breakdown cases,
4151 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
4152 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4153 //             s64 = G_SHL s64, s32
4154 LegalizerHelper::LegalizeResult
4155 LegalizerHelper::fewerElementsVectorMultiEltType(
4156     GenericMachineInstr &MI, unsigned NumElts,
4157     std::initializer_list<unsigned> NonVecOpIndices) {
4158   assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
4159          "Non-compatible opcode or not specified non-vector operands");
4160   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
4161 
4162   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
4163   unsigned NumDefs = MI.getNumDefs();
4164 
4165   // Create DstOps (sub-vectors with NumElts elts + leftover) for each output.
4166   // Build instructions with DstOps so that the instruction found by CSE can be
4167   // used directly; CSE copies it into the given vreg when building with a vreg dest.
4168   SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
4169   // Output registers will be taken from created instructions.
4170   SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
4171   for (unsigned i = 0; i < NumDefs; ++i) {
4172     makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
4173   }
4174 
4175   // Split vector input operands into sub-vectors with NumElts elts + Leftover.
4176   // Operands listed in NonVecOpIndices will be used as is without splitting;
4177   // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
4178   // scalar condition (op 1), immediate in sext_inreg (op 2).
4179   SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
4180   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
4181        ++UseIdx, ++UseNo) {
4182     if (is_contained(NonVecOpIndices, UseIdx)) {
4183       broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
4184                      MI.getOperand(UseIdx));
4185     } else {
4186       SmallVector<Register, 8> SplitPieces;
4187       extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces);
4188       for (auto Reg : SplitPieces)
4189         InputOpsPieces[UseNo].push_back(Reg);
4190     }
4191   }
4192 
4193   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
4194 
4195   // Take the i-th piece of each input operand split and build a sub-vector or
4196   // scalar instruction, with the i-th DstOp(s) from OutputOpsPieces as dest(s).
4197   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
4198     SmallVector<DstOp, 2> Defs;
4199     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
4200       Defs.push_back(OutputOpsPieces[DstNo][i]);
4201 
4202     SmallVector<SrcOp, 3> Uses;
4203     for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
4204       Uses.push_back(InputOpsPieces[InputNo][i]);
4205 
4206     auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
4207     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
4208       OutputRegs[DstNo].push_back(I.getReg(DstNo));
4209   }
4210 
4211   // Merge small outputs into MI's output for each def operand.
4212   if (NumLeftovers) {
4213     for (unsigned i = 0; i < NumDefs; ++i)
4214       mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
4215   } else {
4216     for (unsigned i = 0; i < NumDefs; ++i)
4217       MIRBuilder.buildMergeLikeInstr(MI.getReg(i), OutputRegs[i]);
4218   }
4219 
4220   MI.eraseFromParent();
4221   return Legalized;
4222 }
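
// For illustration (types chosen arbitrarily), a G_ICMP split with
// NumElts = 2 and NonVecOpIndices = {1}, so the predicate is broadcast
// rather than split:
//   %c:_(<4 x s1>) = G_ICMP intpred(eq), %a:_(<4 x s32>), %b:_(<4 x s32>)
// becomes:
//   %c0:_(<2 x s1>) = G_ICMP intpred(eq), %a0:_(<2 x s32>), %b0:_(<2 x s32>)
//   %c1:_(<2 x s1>) = G_ICMP intpred(eq), %a1:_(<2 x s32>), %b1:_(<2 x s32>)
//   %c:_(<4 x s1>) = G_CONCAT_VECTORS %c0:_(<2 x s1>), %c1:_(<2 x s1>)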
4223 
4224 LegalizerHelper::LegalizeResult
4225 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
4226                                         unsigned NumElts) {
4227   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
4228 
4229   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
4230   unsigned NumDefs = MI.getNumDefs();
4231 
4232   SmallVector<DstOp, 8> OutputOpsPieces;
4233   SmallVector<Register, 8> OutputRegs;
4234   makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
4235 
4236   // Instructions that perform the register split will be inserted in the basic
4237   // block where the register is defined (the basic block is in the next operand).
4238   SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
4239   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
4240        UseIdx += 2, ++UseNo) {
4241     MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
4242     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
4243     extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]);
4244   }
4245 
4246   // Build PHIs with fewer elements.
4247   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
4248   MIRBuilder.setInsertPt(*MI.getParent(), MI);
4249   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
4250     auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
4251     Phi.addDef(
4252         MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
4253     OutputRegs.push_back(Phi.getReg(0));
4254 
4255     for (unsigned j = 0; j < NumInputs / 2; ++j) {
4256       Phi.addUse(InputOpsPieces[j][i]);
4257       Phi.add(MI.getOperand(1 + j * 2 + 1));
4258     }
4259   }
4260 
4261   // Merge small outputs into MI's def.
4262   if (NumLeftovers) {
4263     mergeMixedSubvectors(MI.getReg(0), OutputRegs);
4264   } else {
4265     MIRBuilder.buildMergeLikeInstr(MI.getReg(0), OutputRegs);
4266   }
4267 
4268   MI.eraseFromParent();
4269   return Legalized;
4270 }
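
// For illustration (types chosen arbitrarily), splitting
//   %p:_(<4 x s32>) = G_PHI %x(<4 x s32>), %bb.0, %y(<4 x s32>), %bb.1
// with NumElts = 2: the unmerges of %x and %y are emitted in %bb.0 and
// %bb.1 respectively, then
//   %p0:_(<2 x s32>) = G_PHI %x0, %bb.0, %y0, %bb.1
//   %p1:_(<2 x s32>) = G_PHI %x1, %bb.0, %y1, %bb.1
//   %p:_(<4 x s32>) = G_CONCAT_VECTORS %p0:_(<2 x s32>), %p1:_(<2 x s32>)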
4271 
4272 LegalizerHelper::LegalizeResult
4273 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
4274                                                   unsigned TypeIdx,
4275                                                   LLT NarrowTy) {
4276   const int NumDst = MI.getNumOperands() - 1;
4277   const Register SrcReg = MI.getOperand(NumDst).getReg();
4278   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4279   LLT SrcTy = MRI.getType(SrcReg);
4280 
4281   if (TypeIdx != 1 || NarrowTy == DstTy)
4282     return UnableToLegalize;
4283 
4284   // Requires compatible types. Otherwise SrcReg should have been defined by a
4285   // merge-like instruction that would get artifact-combined. Most likely the
4286   // instruction that defines SrcReg has to perform more/fewer elements
4287   // legalization compatible with NarrowTy.
4288   assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
4289   assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
4290 
4291   if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
4292       (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
4293     return UnableToLegalize;
4294 
4295   // This is most likely DstTy (smaller than register size) packed in SrcTy
4296   // (larger than register size), and since the unmerge was not combined it will
4297   // be lowered to bit-sequence extracts from a register. Unpack SrcTy to
4298   // NarrowTy (register-size) pieces first, then each NarrowTy piece to DstTy.
4299 
4300   // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
4301   //
4302   // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
4303   // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
4304   // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
4305   auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
4306   const int NumUnmerge = Unmerge->getNumOperands() - 1;
4307   const int PartsPerUnmerge = NumDst / NumUnmerge;
4308 
4309   for (int I = 0; I != NumUnmerge; ++I) {
4310     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
4311 
4312     for (int J = 0; J != PartsPerUnmerge; ++J)
4313       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
4314     MIB.addUse(Unmerge.getReg(I));
4315   }
4316 
4317   MI.eraseFromParent();
4318   return Legalized;
4319 }
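
// A concrete instance of the pattern above, with illustrative types
// (SrcTy = <8 x s8>, NarrowTy = <4 x s8>, DstTy = <2 x s8>):
//   %1:_(<2 x s8>), %2, %3, %4 = G_UNMERGE_VALUES %0:_(<8 x s8>)
// becomes:
//   %5:_(<4 x s8>), %6:_(<4 x s8>) = G_UNMERGE_VALUES %0:_(<8 x s8>)
//   %1:_(<2 x s8>), %2:_(<2 x s8>) = G_UNMERGE_VALUES %5:_(<4 x s8>)
//   %3:_(<2 x s8>), %4:_(<2 x s8>) = G_UNMERGE_VALUES %6:_(<4 x s8>)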
4320 
4321 LegalizerHelper::LegalizeResult
4322 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
4323                                           LLT NarrowTy) {
4324   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
4325   // Requires compatible types. Otherwise the user of DstReg did not perform the
4326   // unmerge that should have been artifact-combined. Most likely the instruction
4327   // using DstReg has to do more/fewer elements legalization compatible with NarrowTy.
4328   assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
4329   assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
4330   if (NarrowTy == SrcTy)
4331     return UnableToLegalize;
4332 
4333   // This attempts to lower part of an LCMTy merge/unmerge sequence. Its intended
4334   // use is for old MIR tests. Since the changes to more/fewer elements, it should
4335   // no longer be possible to generate MIR like this when starting from LLVM IR,
4336   // because the LCMTy approach was replaced with merge/unmerge to vector elements.
4337   if (TypeIdx == 1) {
4338     assert(SrcTy.isVector() && "Expected vector types");
4339     assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
4340     if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
4341         (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
4342       return UnableToLegalize;
4343     // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
4344     //
4345     // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
4346     // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
4347     // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
4348     // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
4349     // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
4350     // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
4351 
4352     SmallVector<Register, 8> Elts;
4353     LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
4354     for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
4355       auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
4356       for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
4357         Elts.push_back(Unmerge.getReg(j));
4358     }
4359 
4360     SmallVector<Register, 8> NarrowTyElts;
4361     unsigned NumNarrowTyElts = NarrowTy.getNumElements();
4362     unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
4363     for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
4364          ++i, Offset += NumNarrowTyElts) {
4365       ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
4366       NarrowTyElts.push_back(
4367           MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
4368     }
4369 
4370     MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
4371     MI.eraseFromParent();
4372     return Legalized;
4373   }
4374 
4375   assert(TypeIdx == 0 && "Bad type index");
4376   if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
4377       (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
4378     return UnableToLegalize;
4379 
4380   // This is most likely SrcTy (smaller than register size) packed in DstTy
4381   // (larger than register size), and since the merge was not combined it will be
4382   // lowered to bit-sequence packing into a register. Merge SrcTy to NarrowTy
4383   // (register-size) pieces first, then merge the NarrowTy pieces to DstTy.
4384 
4385   // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
4386   //
4387   // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
4388   // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
4389   // %0:_(DstTy)  = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
4390   SmallVector<Register, 8> NarrowTyElts;
4391   unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
4392   unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
4393   unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
4394   for (unsigned i = 0; i < NumParts; ++i) {
4395     SmallVector<Register, 8> Sources;
4396     for (unsigned j = 0; j < NumElts; ++j)
4397       Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
4398     NarrowTyElts.push_back(
4399         MIRBuilder.buildMergeLikeInstr(NarrowTy, Sources).getReg(0));
4400   }
4401 
4402   MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
4403   MI.eraseFromParent();
4404   return Legalized;
4405 }
4406 
4407 LegalizerHelper::LegalizeResult
4408 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
4409                                                            unsigned TypeIdx,
4410                                                            LLT NarrowVecTy) {
4411   auto [DstReg, SrcVec] = MI.getFirst2Regs();
4412   Register InsertVal;
4413   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
4414 
4415   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
4416   if (IsInsert)
4417     InsertVal = MI.getOperand(2).getReg();
4418 
4419   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
4420 
4421   // TODO: Handle total scalarization case.
4422   if (!NarrowVecTy.isVector())
4423     return UnableToLegalize;
4424 
4425   LLT VecTy = MRI.getType(SrcVec);
4426 
4427   // If the index is a constant, we can really break this down as you would
4428   // expect, and index into the target size pieces.
4429   int64_t IdxVal;
4430   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
4431   if (MaybeCst) {
4432     IdxVal = MaybeCst->Value.getSExtValue();
4433     // Avoid out-of-bounds indexing of the pieces.
4434     if (IdxVal >= VecTy.getNumElements()) {
4435       MIRBuilder.buildUndef(DstReg);
4436       MI.eraseFromParent();
4437       return Legalized;
4438     }
4439 
4440     SmallVector<Register, 8> VecParts;
4441     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4442 
4443     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4444     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4445                                     TargetOpcode::G_ANYEXT);
4446 
4447     unsigned NewNumElts = NarrowVecTy.getNumElements();
4448 
4449     LLT IdxTy = MRI.getType(Idx);
4450     int64_t PartIdx = IdxVal / NewNumElts;
4451     auto NewIdx =
4452         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4453 
4454     if (IsInsert) {
4455       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4456 
4457       // Use the adjusted index to insert into one of the subvectors.
4458       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4459           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4460       VecParts[PartIdx] = InsertPart.getReg(0);
4461 
4462       // Recombine the inserted subvector with the others to reform the result
4463       // vector.
4464       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4465     } else {
4466       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4467     }
4468 
4469     MI.eraseFromParent();
4470     return Legalized;
4471   }
4472 
4473   // With a variable index, we can't perform the operation in a smaller type, so
4474   // we're forced to expand this.
4475   //
4476   // TODO: We could emit a chain of compare/select to figure out which piece to
4477   // index.
4478   return lowerExtractInsertVectorElt(MI);
4479 }
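
// For illustration (types chosen arbitrarily): extracting element 5 of an
// <8 x s32> with NarrowVecTy = <4 x s32> splits the source into two
// <4 x s32> pieces; PartIdx = 5 / 4 = 1 and the adjusted index is
// 5 - 4 * 1 = 1, so the result is a G_EXTRACT_VECTOR_ELT of element 1 from
// the second piece.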
4480 
4481 LegalizerHelper::LegalizeResult
4482 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4483                                       LLT NarrowTy) {
4484   // FIXME: Don't know how to handle secondary types yet.
4485   if (TypeIdx != 0)
4486     return UnableToLegalize;
4487 
4488   // This implementation doesn't work for atomics. Give up instead of doing
4489   // something invalid.
4490   if (LdStMI.isAtomic())
4491     return UnableToLegalize;
4492 
4493   bool IsLoad = isa<GLoad>(LdStMI);
4494   Register ValReg = LdStMI.getReg(0);
4495   Register AddrReg = LdStMI.getPointerReg();
4496   LLT ValTy = MRI.getType(ValReg);
4497 
4498   // FIXME: Do we need a distinct NarrowMemory legalize action?
4499   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4500     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4501     return UnableToLegalize;
4502   }
4503 
4504   int NumParts = -1;
4505   int NumLeftover = -1;
4506   LLT LeftoverTy;
4507   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4508   if (IsLoad) {
4509     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4510   } else {
4511     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4512                      NarrowLeftoverRegs)) {
4513       NumParts = NarrowRegs.size();
4514       NumLeftover = NarrowLeftoverRegs.size();
4515     }
4516   }
4517 
4518   if (NumParts == -1)
4519     return UnableToLegalize;
4520 
4521   LLT PtrTy = MRI.getType(AddrReg);
4522   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4523 
4524   unsigned TotalSize = ValTy.getSizeInBits();
4525 
4526   // Split the load/store into PartTy-sized pieces starting at Offset. If this
4527   // is a load, return the new registers in ValRegs. For a store, each element
4528   // of ValRegs should be PartTy. Returns the next offset that needs to be
4529   // handled.
4530   bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
4531   auto MMO = LdStMI.getMMO();
4532   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4533                              unsigned NumParts, unsigned Offset) -> unsigned {
4534     MachineFunction &MF = MIRBuilder.getMF();
4535     unsigned PartSize = PartTy.getSizeInBits();
4536     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4537          ++Idx) {
4538       unsigned ByteOffset = Offset / 8;
4539       Register NewAddrReg;
4540 
4541       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4542 
4543       MachineMemOperand *NewMMO =
4544           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4545 
4546       if (IsLoad) {
4547         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4548         ValRegs.push_back(Dst);
4549         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4550       } else {
4551         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4552       }
4553       Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
4554     }
4555 
4556     return Offset;
4557   };
4558 
4559   unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
4560   unsigned HandledOffset =
4561       splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
4562 
4563   // Handle the rest of the register if this isn't an even type breakdown.
4564   if (LeftoverTy.isValid())
4565     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
4566 
4567   if (IsLoad) {
4568     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4569                 LeftoverTy, NarrowLeftoverRegs);
4570   }
4571 
4572   LdStMI.eraseFromParent();
4573   return Legalized;
4574 }
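
// For illustration (types chosen arbitrarily): narrowing a little-endian s96
// load with NarrowTy = s64, getNarrowTypeBreakDown yields one s64 part plus
// an s32 leftover, so this emits an s64 load at byte offset 0, an s32 load
// at byte offset 8, and a merge of the two pieces back into the s96 value.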
4575 
4576 LegalizerHelper::LegalizeResult
4577 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4578                                      LLT NarrowTy) {
4579   using namespace TargetOpcode;
4580   GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
4581   unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
4582 
4583   switch (MI.getOpcode()) {
4584   case G_IMPLICIT_DEF:
4585   case G_TRUNC:
4586   case G_AND:
4587   case G_OR:
4588   case G_XOR:
4589   case G_ADD:
4590   case G_SUB:
4591   case G_MUL:
4592   case G_PTR_ADD:
4593   case G_SMULH:
4594   case G_UMULH:
4595   case G_FADD:
4596   case G_FMUL:
4597   case G_FSUB:
4598   case G_FNEG:
4599   case G_FABS:
4600   case G_FCANONICALIZE:
4601   case G_FDIV:
4602   case G_FREM:
4603   case G_FMA:
4604   case G_FMAD:
4605   case G_FPOW:
4606   case G_FEXP:
4607   case G_FEXP2:
4608   case G_FEXP10:
4609   case G_FLOG:
4610   case G_FLOG2:
4611   case G_FLOG10:
4612   case G_FLDEXP:
4613   case G_FNEARBYINT:
4614   case G_FCEIL:
4615   case G_FFLOOR:
4616   case G_FRINT:
4617   case G_INTRINSIC_ROUND:
4618   case G_INTRINSIC_ROUNDEVEN:
4619   case G_INTRINSIC_TRUNC:
4620   case G_FCOS:
4621   case G_FSIN:
4622   case G_FSQRT:
4623   case G_BSWAP:
4624   case G_BITREVERSE:
4625   case G_SDIV:
4626   case G_UDIV:
4627   case G_SREM:
4628   case G_UREM:
4629   case G_SDIVREM:
4630   case G_UDIVREM:
4631   case G_SMIN:
4632   case G_SMAX:
4633   case G_UMIN:
4634   case G_UMAX:
4635   case G_ABS:
4636   case G_FMINNUM:
4637   case G_FMAXNUM:
4638   case G_FMINNUM_IEEE:
4639   case G_FMAXNUM_IEEE:
4640   case G_FMINIMUM:
4641   case G_FMAXIMUM:
4642   case G_FSHL:
4643   case G_FSHR:
4644   case G_ROTL:
4645   case G_ROTR:
4646   case G_FREEZE:
4647   case G_SADDSAT:
4648   case G_SSUBSAT:
4649   case G_UADDSAT:
4650   case G_USUBSAT:
4651   case G_UMULO:
4652   case G_SMULO:
4653   case G_SHL:
4654   case G_LSHR:
4655   case G_ASHR:
4656   case G_SSHLSAT:
4657   case G_USHLSAT:
4658   case G_CTLZ:
4659   case G_CTLZ_ZERO_UNDEF:
4660   case G_CTTZ:
4661   case G_CTTZ_ZERO_UNDEF:
4662   case G_CTPOP:
4663   case G_FCOPYSIGN:
4664   case G_ZEXT:
4665   case G_SEXT:
4666   case G_ANYEXT:
4667   case G_FPEXT:
4668   case G_FPTRUNC:
4669   case G_SITOFP:
4670   case G_UITOFP:
4671   case G_FPTOSI:
4672   case G_FPTOUI:
4673   case G_INTTOPTR:
4674   case G_PTRTOINT:
4675   case G_ADDRSPACE_CAST:
4676   case G_UADDO:
4677   case G_USUBO:
4678   case G_UADDE:
4679   case G_USUBE:
4680   case G_SADDO:
4681   case G_SSUBO:
4682   case G_SADDE:
4683   case G_SSUBE:
4684   case G_STRICT_FADD:
4685   case G_STRICT_FSUB:
4686   case G_STRICT_FMUL:
4687   case G_STRICT_FMA:
4688   case G_STRICT_FLDEXP:
4689   case G_FFREXP:
4690     return fewerElementsVectorMultiEltType(GMI, NumElts);
4691   case G_ICMP:
4692   case G_FCMP:
4693     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cmp predicate*/});
4694   case G_IS_FPCLASS:
4695     return fewerElementsVectorMultiEltType(GMI, NumElts, {2, 3 /*mask,fpsem*/});
4696   case G_SELECT:
4697     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4698       return fewerElementsVectorMultiEltType(GMI, NumElts);
4699     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
4700   case G_PHI:
4701     return fewerElementsVectorPhi(GMI, NumElts);
4702   case G_UNMERGE_VALUES:
4703     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4704   case G_BUILD_VECTOR:
4705     assert(TypeIdx == 0 && "not a vector type index");
4706     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4707   case G_CONCAT_VECTORS:
4708     if (TypeIdx != 1) // TODO: This probably does work as expected already.
4709       return UnableToLegalize;
4710     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4711   case G_EXTRACT_VECTOR_ELT:
4712   case G_INSERT_VECTOR_ELT:
4713     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4714   case G_LOAD:
4715   case G_STORE:
4716     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4717   case G_SEXT_INREG:
4718     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
4719   GISEL_VECREDUCE_CASES_NONSEQ
4720     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4721   case TargetOpcode::G_VECREDUCE_SEQ_FADD:
4722   case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
4723     return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
4724   case G_SHUFFLE_VECTOR:
4725     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4726   case G_FPOWI:
4727     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
4728   default:
4729     return UnableToLegalize;
4730   }
4731 }
4732 
4733 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4734     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4735   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4736   if (TypeIdx != 0)
4737     return UnableToLegalize;
4738 
4739   auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
4740       MI.getFirst3RegLLTs();
4741   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4742   // The shuffle should be canonicalized by now.
4743   if (DstTy != Src1Ty)
4744     return UnableToLegalize;
4745   if (DstTy != Src2Ty)
4746     return UnableToLegalize;
4747 
4748   if (!isPowerOf2_32(DstTy.getNumElements()))
4749     return UnableToLegalize;
4750 
4751   // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4752   // Further legalization attempts will be needed to split further.
4753   NarrowTy =
4754       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4755   unsigned NewElts = NarrowTy.getNumElements();
4756 
4757   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4758   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4759   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4760   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4761                         SplitSrc2Regs[1]};
4762 
4763   Register Hi, Lo;
4764 
4765   // If Lo or Hi uses elements from at most two of the four input vectors, then
4766   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4767   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
4768   SmallVector<int, 16> Ops;
4769   for (unsigned High = 0; High < 2; ++High) {
4770     Register &Output = High ? Hi : Lo;
4771 
4772     // Build a shuffle mask for the output, discovering on the fly which
4773     // input vectors to use as shuffle operands (recorded in InputUsed).
4774     // If building a suitable shuffle vector proves too hard, then bail
4775     // out with UseBuildVector set.
4776     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4777     unsigned FirstMaskIdx = High * NewElts;
4778     bool UseBuildVector = false;
4779     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4780       // The mask element.  This indexes into the input.
4781       int Idx = Mask[FirstMaskIdx + MaskOffset];
4782 
4783       // The input vector this mask element indexes into.
4784       unsigned Input = (unsigned)Idx / NewElts;
4785 
4786       if (Input >= std::size(Inputs)) {
4787         // The mask element does not index into any input vector.
4788         Ops.push_back(-1);
4789         continue;
4790       }
4791 
4792       // Turn the index into an offset from the start of the input vector.
4793       Idx -= Input * NewElts;
4794 
4795       // Find or create a shuffle vector operand to hold this input.
4796       unsigned OpNo;
4797       for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
4798         if (InputUsed[OpNo] == Input) {
4799           // This input vector is already an operand.
4800           break;
4801         } else if (InputUsed[OpNo] == -1U) {
4802           // Create a new operand for this input vector.
4803           InputUsed[OpNo] = Input;
4804           break;
4805         }
4806       }
4807 
4808       if (OpNo >= std::size(InputUsed)) {
4809         // More than two input vectors used!  Give up on trying to create a
4810         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4811         UseBuildVector = true;
4812         break;
4813       }
4814 
4815       // Add the mask index for the new shuffle vector.
4816       Ops.push_back(Idx + OpNo * NewElts);
4817     }
4818 
4819     if (UseBuildVector) {
4820       LLT EltTy = NarrowTy.getElementType();
4821       SmallVector<Register, 16> SVOps;
4822 
4823       // Extract the input elements by hand.
4824       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4825         // The mask element.  This indexes into the input.
4826         int Idx = Mask[FirstMaskIdx + MaskOffset];
4827 
4828         // The input vector this mask element indexes into.
4829         unsigned Input = (unsigned)Idx / NewElts;
4830 
4831         if (Input >= std::size(Inputs)) {
4832           // The mask element is "undef" or indexes off the end of the input.
4833           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4834           continue;
4835         }
4836 
4837         // Turn the index into an offset from the start of the input vector.
4838         Idx -= Input * NewElts;
4839 
4840         // Extract the vector element by hand.
4841         SVOps.push_back(MIRBuilder
4842                             .buildExtractVectorElement(
4843                                 EltTy, Inputs[Input],
4844                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4845                             .getReg(0));
4846       }
4847 
4848       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4849       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4850     } else if (InputUsed[0] == -1U) {
4851       // No input vectors were used! The result is undefined.
4852       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4853     } else {
4854       Register Op0 = Inputs[InputUsed[0]];
4855       // If only one input was used, use an undefined vector for the other.
4856       Register Op1 = InputUsed[1] == -1U
4857                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4858                          : Inputs[InputUsed[1]];
4859       // At least one input vector was used. Create a new shuffle vector.
4860       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4861     }
4862 
4863     Ops.clear();
4864   }
4865 
4866   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4867   MI.eraseFromParent();
4868   return Legalized;
4869 }
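
// For illustration (types chosen arbitrarily), splitting
//   %d:_(<4 x s32>) = G_SHUFFLE_VECTOR %a, %b, shufflemask(0, 4, 1, 5)
// in half: the Lo mask elements (0, 4) touch only the low halves of %a and
// %b (InputUsed = {0, 2}), so Lo becomes a <2 x s32> shuffle of those two
// pieces with mask (0, 2); Hi uses mask elements (1, 5) and becomes a
// shuffle of the same pieces with mask (1, 3).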
4870 
4871 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4872     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4873   auto &RdxMI = cast<GVecReduce>(MI);
4874 
4875   if (TypeIdx != 1)
4876     return UnableToLegalize;
4877 
4878   // The semantics of the normal non-sequential reductions allow us to freely
4879   // re-associate the operation.
4880   auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
4881 
4882   if (NarrowTy.isVector() &&
4883       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4884     return UnableToLegalize;
4885 
4886   unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
4887   SmallVector<Register> SplitSrcs;
4888   // If NarrowTy is a scalar then we're being asked to scalarize.
4889   const unsigned NumParts =
4890       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4891                           : SrcTy.getNumElements();
4892 
4893   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4894   if (NarrowTy.isScalar()) {
4895     if (DstTy != NarrowTy)
4896       return UnableToLegalize; // FIXME: handle implicit extensions.
4897 
4898     if (isPowerOf2_32(NumParts)) {
4899       // Generate a tree of scalar operations to reduce the critical path.
4900       SmallVector<Register> PartialResults;
4901       unsigned NumPartsLeft = NumParts;
4902       while (NumPartsLeft > 1) {
4903         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4904           PartialResults.emplace_back(
4905               MIRBuilder
4906                   .buildInstr(ScalarOpc, {NarrowTy},
4907                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4908                   .getReg(0));
4909         }
4910         SplitSrcs = PartialResults;
4911         PartialResults.clear();
4912         NumPartsLeft = SplitSrcs.size();
4913       }
4914       assert(SplitSrcs.size() == 1);
4915       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4916       MI.eraseFromParent();
4917       return Legalized;
4918     }
4919     // If we can't generate a tree, then just do sequential operations.
4920     Register Acc = SplitSrcs[0];
4921     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4922       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4923                 .getReg(0);
4924     MIRBuilder.buildCopy(DstReg, Acc);
4925     MI.eraseFromParent();
4926     return Legalized;
4927   }
4928   SmallVector<Register> PartialReductions;
4929   for (unsigned Part = 0; Part < NumParts; ++Part) {
4930     PartialReductions.push_back(
4931         MIRBuilder.buildInstr(RdxMI.getOpcode(), {DstTy}, {SplitSrcs[Part]})
4932             .getReg(0));
4933   }
4934 
4935   // If the types involved are powers of 2, we can generate intermediate vector
4936   // ops before generating a final reduction operation.
4937   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4938       isPowerOf2_32(NarrowTy.getNumElements())) {
4939     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4940   }
4941 
4942   Register Acc = PartialReductions[0];
4943   for (unsigned Part = 1; Part < NumParts; ++Part) {
4944     if (Part == NumParts - 1) {
4945       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4946                             {Acc, PartialReductions[Part]});
4947     } else {
4948       Acc = MIRBuilder
4949                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4950                 .getReg(0);
4951     }
4952   }
4953   MI.eraseFromParent();
4954   return Legalized;
4955 }
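
// For illustration (types chosen arbitrarily): scalarizing
//   %r:_(s32) = G_VECREDUCE_ADD %v:_(<4 x s32>)
// with NarrowTy = s32 extracts the four elements and, since 4 is a power of
// two, combines them as a tree of G_ADDs, (e0 + e1) + (e2 + e3), to shorten
// the critical path.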
4956 
4957 LegalizerHelper::LegalizeResult
4958 LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
4959                                                   unsigned int TypeIdx,
4960                                                   LLT NarrowTy) {
4961   auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
4962       MI.getFirst3RegLLTs();
4963   if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
4964       DstTy != NarrowTy)
4965     return UnableToLegalize;
4966 
4967   assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
4968           MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
4969          "Unexpected vecreduce opcode");
4970   unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
4971                            ? TargetOpcode::G_FADD
4972                            : TargetOpcode::G_FMUL;
4973 
4974   SmallVector<Register> SplitSrcs;
4975   unsigned NumParts = SrcTy.getNumElements();
4976   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4977   Register Acc = ScalarReg;
4978   for (unsigned i = 0; i < NumParts; i++)
4979     Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[i]})
4980               .getReg(0);
4981 
4982   MIRBuilder.buildCopy(DstReg, Acc);
4983   MI.eraseFromParent();
4984   return Legalized;
4985 }
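
// For illustration (types chosen arbitrarily):
//   %r:_(s32) = G_VECREDUCE_SEQ_FADD %acc:_(s32), %v:_(<4 x s32>)
// with NarrowTy = s32 becomes the in-order G_FADD chain
// (((%acc + e0) + e1) + e2) + e3, preserving the sequential semantics.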
4986 
4987 LegalizerHelper::LegalizeResult
4988 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4989                                         LLT SrcTy, LLT NarrowTy,
4990                                         unsigned ScalarOpc) {
4991   SmallVector<Register> SplitSrcs;
4992   // Split the sources into NarrowTy-sized pieces.
4993   extractParts(SrcReg, NarrowTy,
4994                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4995   // We're going to do a tree reduction using vector operations until we have
4996   // one NarrowTy-sized value left.
4997   while (SplitSrcs.size() > 1) {
4998     SmallVector<Register> PartialRdxs;
4999     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
5000       Register LHS = SplitSrcs[Idx];
5001       Register RHS = SplitSrcs[Idx + 1];
5002       // Create the intermediate vector op.
5003       Register Res =
5004           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
5005       PartialRdxs.push_back(Res);
5006     }
5007     SplitSrcs = std::move(PartialRdxs);
5008   }
5009   // Finally generate the requested NarrowTy based reduction.
5010   Observer.changingInstr(MI);
5011   MI.getOperand(1).setReg(SplitSrcs[0]);
5012   Observer.changedInstr(MI);
5013   return Legalized;
5014 }
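
// For illustration (types chosen arbitrarily):
//   %r:_(s32) = G_VECREDUCE_ADD %v:_(<8 x s32>)
// with NarrowTy = <2 x s32> combines the four <2 x s32> pieces pairwise with
// vector G_ADDs (4 -> 2 -> 1) and then rewrites the reduction to take the
// final <2 x s32> value as its operand.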
5015 
5016 LegalizerHelper::LegalizeResult
5017 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
5018                                              const LLT HalfTy, const LLT AmtTy) {
5019 
5020   Register InL = MRI.createGenericVirtualRegister(HalfTy);
5021   Register InH = MRI.createGenericVirtualRegister(HalfTy);
5022   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
5023 
5024   if (Amt.isZero()) {
5025     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {InL, InH});
5026     MI.eraseFromParent();
5027     return Legalized;
5028   }
5029 
5030   LLT NVT = HalfTy;
5031   unsigned NVTBits = HalfTy.getSizeInBits();
5032   unsigned VTBits = 2 * NVTBits;
5033 
5034   SrcOp Lo(Register(0)), Hi(Register(0));
5035   if (MI.getOpcode() == TargetOpcode::G_SHL) {
5036     if (Amt.ugt(VTBits)) {
5037       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
5038     } else if (Amt.ugt(NVTBits)) {
5039       Lo = MIRBuilder.buildConstant(NVT, 0);
5040       Hi = MIRBuilder.buildShl(NVT, InL,
5041                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
5042     } else if (Amt == NVTBits) {
5043       Lo = MIRBuilder.buildConstant(NVT, 0);
5044       Hi = InL;
5045     } else {
5046       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
5047       auto OrLHS =
5048           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
5049       auto OrRHS = MIRBuilder.buildLShr(
5050           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
5051       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
5052     }
5053   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
5054     if (Amt.ugt(VTBits)) {
5055       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
5056     } else if (Amt.ugt(NVTBits)) {
5057       Lo = MIRBuilder.buildLShr(NVT, InH,
5058                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
5059       Hi = MIRBuilder.buildConstant(NVT, 0);
5060     } else if (Amt == NVTBits) {
5061       Lo = InH;
5062       Hi = MIRBuilder.buildConstant(NVT, 0);
5063     } else {
5064       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
5065 
5066       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
5067       auto OrRHS = MIRBuilder.buildShl(
5068           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
5069 
5070       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
5071       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
5072     }
5073   } else {
5074     if (Amt.ugt(VTBits)) {
5075       Hi = Lo = MIRBuilder.buildAShr(
5076           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
5077     } else if (Amt.ugt(NVTBits)) {
5078       Lo = MIRBuilder.buildAShr(NVT, InH,
5079                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
5080       Hi = MIRBuilder.buildAShr(NVT, InH,
5081                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
5082     } else if (Amt == NVTBits) {
5083       Lo = InH;
5084       Hi = MIRBuilder.buildAShr(NVT, InH,
5085                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
5086     } else {
5087       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
5088 
5089       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
5090       auto OrRHS = MIRBuilder.buildShl(
5091           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
5092 
5093       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
5094       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
5095     }
5096   }
5097 
5098   MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {Lo, Hi});
5099   MI.eraseFromParent();
5100 
5101   return Legalized;
5102 }
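
// For illustration (types chosen arbitrarily): narrowing
//   %d:_(s64) = G_SHL %x:_(s64), 40
// with HalfTy = s32 hits the Amt.ugt(NVTBits) case: Lo = 0 and
// Hi = InL << (40 - 32), since every low-half bit is shifted past the
// half-word boundary.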
5103 
5104 // TODO: Optimize if constant shift amount.
5105 LegalizerHelper::LegalizeResult
5106 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
5107                                    LLT RequestedTy) {
5108   if (TypeIdx == 1) {
5109     Observer.changingInstr(MI);
5110     narrowScalarSrc(MI, RequestedTy, 2);
5111     Observer.changedInstr(MI);
5112     return Legalized;
5113   }
5114 
5115   Register DstReg = MI.getOperand(0).getReg();
5116   LLT DstTy = MRI.getType(DstReg);
5117   if (DstTy.isVector())
5118     return UnableToLegalize;
5119 
5120   Register Amt = MI.getOperand(2).getReg();
5121   LLT ShiftAmtTy = MRI.getType(Amt);
5122   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
5123   if (DstEltSize % 2 != 0)
5124     return UnableToLegalize;
5125 
5126   // Ignore the input type. We can only go to exactly half the size of the
5127   // input. If that isn't small enough, the resulting pieces will be further
5128   // legalized.
5129   const unsigned NewBitSize = DstEltSize / 2;
5130   const LLT HalfTy = LLT::scalar(NewBitSize);
5131   const LLT CondTy = LLT::scalar(1);
5132 
5133   if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
5134     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
5135                                        ShiftAmtTy);
5136   }
5137 
5138   // TODO: Expand with known bits.
5139 
5140   // Handle the fully general expansion by an unknown amount.
5141   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
5142 
5143   Register InL = MRI.createGenericVirtualRegister(HalfTy);
5144   Register InH = MRI.createGenericVirtualRegister(HalfTy);
5145   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
5146 
5147   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
5148   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
5149 
5150   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
5151   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
5152   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
5153 
5154   Register ResultRegs[2];
5155   switch (MI.getOpcode()) {
5156   case TargetOpcode::G_SHL: {
5157     // Short: ShAmt < NewBitSize
5158     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
5159 
5160     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
5161     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
5162     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
5163 
5164     // Long: ShAmt >= NewBitSize
5165     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
5166     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
5167 
5168     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
5169     auto Hi = MIRBuilder.buildSelect(
5170         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
5171 
5172     ResultRegs[0] = Lo.getReg(0);
5173     ResultRegs[1] = Hi.getReg(0);
5174     break;
5175   }
5176   case TargetOpcode::G_LSHR:
5177   case TargetOpcode::G_ASHR: {
5178     // Short: ShAmt < NewBitSize
5179     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
5180 
5181     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
5182     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
5183     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
5184 
5185     // Long: ShAmt >= NewBitSize
5186     MachineInstrBuilder HiL;
5187     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
5188       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
5189     } else {
5190       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
5191       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
5192     }
5193     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
5194                                      {InH, AmtExcess});     // Lo from Hi part.
5195 
5196     auto Lo = MIRBuilder.buildSelect(
5197         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
5198 
5199     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
5200 
5201     ResultRegs[0] = Lo.getReg(0);
5202     ResultRegs[1] = Hi.getReg(0);
5203     break;
5204   }
5205   default:
5206     llvm_unreachable("not a shift");
5207   }
5208 
5209   MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs);
5210   MI.eraseFromParent();
5211   return Legalized;
5212 }
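
// For illustration, the general (unknown shift amount) expansion of an s64
// G_LSHR with 32-bit halves selects between two cases:
//   Amt < 32:  Lo = (InL >> Amt) | (InH << (32 - Amt)), Hi = InH >> Amt
//   Amt >= 32: Lo = InH >> (Amt - 32),                  Hi = 0
// with an extra IsZero select so that Amt == 0 passes InL through unchanged.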
5213 
5214 LegalizerHelper::LegalizeResult
5215 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
5216                                        LLT MoreTy) {
5217   assert(TypeIdx == 0 && "Expecting only Idx 0");
5218 
5219   Observer.changingInstr(MI);
5220   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5221     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
5222     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
5223     moreElementsVectorSrc(MI, MoreTy, I);
5224   }
5225 
5226   MachineBasicBlock &MBB = *MI.getParent();
5227   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
5228   moreElementsVectorDst(MI, MoreTy, 0);
5229   Observer.changedInstr(MI);
5230   return Legalized;
5231 }
5232 
5233 LegalizerHelper::LegalizeResult
5234 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
5235                                     LLT MoreTy) {
5236   unsigned Opc = MI.getOpcode();
5237   switch (Opc) {
5238   case TargetOpcode::G_IMPLICIT_DEF:
5239   case TargetOpcode::G_LOAD: {
5240     if (TypeIdx != 0)
5241       return UnableToLegalize;
5242     Observer.changingInstr(MI);
5243     moreElementsVectorDst(MI, MoreTy, 0);
5244     Observer.changedInstr(MI);
5245     return Legalized;
5246   }
5247   case TargetOpcode::G_STORE:
5248     if (TypeIdx != 0)
5249       return UnableToLegalize;
5250     Observer.changingInstr(MI);
5251     moreElementsVectorSrc(MI, MoreTy, 0);
5252     Observer.changedInstr(MI);
5253     return Legalized;
5254   case TargetOpcode::G_AND:
5255   case TargetOpcode::G_OR:
5256   case TargetOpcode::G_XOR:
5257   case TargetOpcode::G_ADD:
5258   case TargetOpcode::G_SUB:
5259   case TargetOpcode::G_MUL:
5260   case TargetOpcode::G_FADD:
5261   case TargetOpcode::G_FSUB:
5262   case TargetOpcode::G_FMUL:
5263   case TargetOpcode::G_FDIV:
5264   case TargetOpcode::G_UADDSAT:
5265   case TargetOpcode::G_USUBSAT:
5266   case TargetOpcode::G_SADDSAT:
5267   case TargetOpcode::G_SSUBSAT:
5268   case TargetOpcode::G_SMIN:
5269   case TargetOpcode::G_SMAX:
5270   case TargetOpcode::G_UMIN:
5271   case TargetOpcode::G_UMAX:
5272   case TargetOpcode::G_FMINNUM:
5273   case TargetOpcode::G_FMAXNUM:
5274   case TargetOpcode::G_FMINNUM_IEEE:
5275   case TargetOpcode::G_FMAXNUM_IEEE:
5276   case TargetOpcode::G_FMINIMUM:
5277   case TargetOpcode::G_FMAXIMUM:
5278   case TargetOpcode::G_STRICT_FADD:
5279   case TargetOpcode::G_STRICT_FSUB:
5280   case TargetOpcode::G_STRICT_FMUL: {
5281     Observer.changingInstr(MI);
5282     moreElementsVectorSrc(MI, MoreTy, 1);
5283     moreElementsVectorSrc(MI, MoreTy, 2);
5284     moreElementsVectorDst(MI, MoreTy, 0);
5285     Observer.changedInstr(MI);
5286     return Legalized;
5287   }
5288   case TargetOpcode::G_FMA:
5289   case TargetOpcode::G_STRICT_FMA:
5290   case TargetOpcode::G_FSHR:
5291   case TargetOpcode::G_FSHL: {
5292     Observer.changingInstr(MI);
5293     moreElementsVectorSrc(MI, MoreTy, 1);
5294     moreElementsVectorSrc(MI, MoreTy, 2);
5295     moreElementsVectorSrc(MI, MoreTy, 3);
5296     moreElementsVectorDst(MI, MoreTy, 0);
5297     Observer.changedInstr(MI);
5298     return Legalized;
5299   }
5300   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
5301   case TargetOpcode::G_EXTRACT:
5302     if (TypeIdx != 1)
5303       return UnableToLegalize;
5304     Observer.changingInstr(MI);
5305     moreElementsVectorSrc(MI, MoreTy, 1);
5306     Observer.changedInstr(MI);
5307     return Legalized;
5308   case TargetOpcode::G_INSERT:
5309   case TargetOpcode::G_INSERT_VECTOR_ELT:
5310   case TargetOpcode::G_FREEZE:
5311   case TargetOpcode::G_FNEG:
5312   case TargetOpcode::G_FABS:
5313   case TargetOpcode::G_FSQRT:
5314   case TargetOpcode::G_FCEIL:
5315   case TargetOpcode::G_FFLOOR:
5316   case TargetOpcode::G_FNEARBYINT:
5317   case TargetOpcode::G_FRINT:
5318   case TargetOpcode::G_INTRINSIC_ROUND:
5319   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
5320   case TargetOpcode::G_INTRINSIC_TRUNC:
5321   case TargetOpcode::G_BSWAP:
5322   case TargetOpcode::G_FCANONICALIZE:
5323   case TargetOpcode::G_SEXT_INREG:
5324     if (TypeIdx != 0)
5325       return UnableToLegalize;
5326     Observer.changingInstr(MI);
5327     moreElementsVectorSrc(MI, MoreTy, 1);
5328     moreElementsVectorDst(MI, MoreTy, 0);
5329     Observer.changedInstr(MI);
5330     return Legalized;
5331   case TargetOpcode::G_SELECT: {
5332     auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
5333     if (TypeIdx == 1) {
5334       if (!CondTy.isScalar() ||
5335           DstTy.getElementCount() != MoreTy.getElementCount())
5336         return UnableToLegalize;
5337 
5338       // This is turning a scalar select of vectors into a vector
5339       // select. Broadcast the select condition.
5340       auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
5341       Observer.changingInstr(MI);
5342       MI.getOperand(1).setReg(ShufSplat.getReg(0));
5343       Observer.changedInstr(MI);
5344       return Legalized;
5345     }
5346 
5347     if (CondTy.isVector())
5348       return UnableToLegalize;
5349 
5350     Observer.changingInstr(MI);
5351     moreElementsVectorSrc(MI, MoreTy, 2);
5352     moreElementsVectorSrc(MI, MoreTy, 3);
5353     moreElementsVectorDst(MI, MoreTy, 0);
5354     Observer.changedInstr(MI);
5355     return Legalized;
5356   }
5357   case TargetOpcode::G_UNMERGE_VALUES:
5358     return UnableToLegalize;
5359   case TargetOpcode::G_PHI:
5360     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
5361   case TargetOpcode::G_SHUFFLE_VECTOR:
5362     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
5363   case TargetOpcode::G_BUILD_VECTOR: {
5364     SmallVector<SrcOp, 8> Elts;
5365     for (auto Op : MI.uses()) {
5366       Elts.push_back(Op.getReg());
5367     }
5368 
5369     for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
5370       Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
5371     }
5372 
5373     MIRBuilder.buildDeleteTrailingVectorElements(
5374         MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
5375     MI.eraseFromParent();
5376     return Legalized;
5377   }
5378   case TargetOpcode::G_TRUNC:
5379   case TargetOpcode::G_FPTRUNC:
5380   case TargetOpcode::G_FPEXT:
5381   case TargetOpcode::G_FPTOSI:
5382   case TargetOpcode::G_FPTOUI:
5383   case TargetOpcode::G_SITOFP:
5384   case TargetOpcode::G_UITOFP: {
5385     if (TypeIdx != 0)
5386       return UnableToLegalize;
5387     Observer.changingInstr(MI);
5388     LLT SrcTy = LLT::fixed_vector(
5389         MoreTy.getNumElements(),
5390         MRI.getType(MI.getOperand(1).getReg()).getElementType());
5391     moreElementsVectorSrc(MI, SrcTy, 1);
5392     moreElementsVectorDst(MI, MoreTy, 0);
5393     Observer.changedInstr(MI);
5394     return Legalized;
5395   }
5396   default:
5397     return UnableToLegalize;
5398   }
5399 }
5400 
5401 LegalizerHelper::LegalizeResult
5402 LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
5403   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5404   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5405   unsigned MaskNumElts = Mask.size();
5406   unsigned SrcNumElts = SrcTy.getNumElements();
5407   LLT DestEltTy = DstTy.getElementType();
5408 
5409   if (MaskNumElts == SrcNumElts)
5410     return Legalized;
5411 
5412   if (MaskNumElts < SrcNumElts) {
5413     // Extend the mask to match the new destination vector size with
5414     // undef values.
5415     SmallVector<int, 16> NewMask(Mask);
5416     for (unsigned I = MaskNumElts; I < SrcNumElts; ++I)
5417       NewMask.push_back(-1);
5418 
5419     moreElementsVectorDst(MI, SrcTy, 0);
5420     MIRBuilder.setInstrAndDebugLoc(MI);
5421     MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5422                                   MI.getOperand(1).getReg(),
5423                                   MI.getOperand(2).getReg(), NewMask);
5424     MI.eraseFromParent();
5425 
5426     return Legalized;
5427   }
5428 
5429   unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
5430   unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
5431   LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy);
5432 
5433   // Create new source vectors by concatenating the initial
5434   // source vectors with undefined vectors of the same size.
5435   auto Undef = MIRBuilder.buildUndef(SrcTy);
5436   SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0));
5437   SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0));
5438   MOps1[0] = MI.getOperand(1).getReg();
5439   MOps2[0] = MI.getOperand(2).getReg();
5440 
5441   auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1);
5442   auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2);
5443 
5444   // Readjust mask for new input vector length.
5445   SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
5446   for (unsigned I = 0; I != MaskNumElts; ++I) {
5447     int Idx = Mask[I];
5448     if (Idx >= static_cast<int>(SrcNumElts))
5449       Idx += PaddedMaskNumElts - SrcNumElts;
5450     MappedOps[I] = Idx;
5451   }
5452 
5453   // If we got more elements than required, extract a subvector.
5454   if (MaskNumElts != PaddedMaskNumElts) {
5455     auto Shuffle =
5456         MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps);
5457 
5458     SmallVector<Register, 16> Elts(MaskNumElts);
5459     for (unsigned I = 0; I < MaskNumElts; ++I) {
5460       Elts[I] =
5461           MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I)
5462               .getReg(0);
5463     }
5464     MIRBuilder.buildBuildVector(DstReg, Elts);
5465   } else {
5466     MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps);
5467   }
5468 
5469   MI.eraseFromParent();
5470   return LegalizerHelper::LegalizeResult::Legalized;
5471 }
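
// For illustration (types chosen arbitrarily): a shuffle producing <6 x s32>
// from two <4 x s32> sources is padded to PaddedMaskNumElts = 8. Each source
// is concatenated with an undef <4 x s32>, mask entries referring to the
// second source are shifted up by 4, and the six requested elements are then
// extracted from the <8 x s32> shuffle result into a G_BUILD_VECTOR.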
5472 
5473 LegalizerHelper::LegalizeResult
5474 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
5475                                            unsigned int TypeIdx, LLT MoreTy) {
5476   auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
5477   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5478   unsigned NumElts = DstTy.getNumElements();
5479   unsigned WidenNumElts = MoreTy.getNumElements();
5480 
5481   if (DstTy.isVector() && Src1Ty.isVector() &&
5482       DstTy.getNumElements() != Src1Ty.getNumElements()) {
5483     return equalizeVectorShuffleLengths(MI);
5484   }
5485 
5486   if (TypeIdx != 0)
5487     return UnableToLegalize;
5488 
5489   // Expect a canonicalized shuffle.
5490   if (DstTy != Src1Ty || DstTy != Src2Ty)
5491     return UnableToLegalize;
5492 
5493   moreElementsVectorSrc(MI, MoreTy, 1);
5494   moreElementsVectorSrc(MI, MoreTy, 2);
5495 
5496   // Adjust mask based on new input vector length.
5497   SmallVector<int, 16> NewMask;
5498   for (unsigned I = 0; I != NumElts; ++I) {
5499     int Idx = Mask[I];
5500     if (Idx < static_cast<int>(NumElts))
5501       NewMask.push_back(Idx);
5502     else
5503       NewMask.push_back(Idx - NumElts + WidenNumElts);
5504   }
5505   for (unsigned I = NumElts; I != WidenNumElts; ++I)
5506     NewMask.push_back(-1);
5507   moreElementsVectorDst(MI, MoreTy, 0);
5508   MIRBuilder.setInstrAndDebugLoc(MI);
5509   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5510                                 MI.getOperand(1).getReg(),
5511                                 MI.getOperand(2).getReg(), NewMask);
5512   MI.eraseFromParent();
5513   return Legalized;
5514 }
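
// For illustration (types chosen arbitrarily), widening
//   %d:_(<2 x s32>) = G_SHUFFLE_VECTOR %a, %b, shufflemask(1, 3)
// to <4 x s32>: entry 1 refers to %a and is kept, entry 3 refers to %b and
// is remapped to 3 - 2 + 4 = 5, and the tail is padded with undef, giving
// shufflemask(1, 5, -1, -1).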
5515 
5516 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
5517                                         ArrayRef<Register> Src1Regs,
5518                                         ArrayRef<Register> Src2Regs,
5519                                         LLT NarrowTy) {
5520   MachineIRBuilder &B = MIRBuilder;
5521   unsigned SrcParts = Src1Regs.size();
5522   unsigned DstParts = DstRegs.size();
5523 
5524   unsigned DstIdx = 0; // Low bits of the result.
5525   Register FactorSum =
5526       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
5527   DstRegs[DstIdx] = FactorSum;
5528 
5529   unsigned CarrySumPrevDstIdx;
5530   SmallVector<Register, 4> Factors;
5531 
5532   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
5533     // Collect low parts of muls for DstIdx.
5534     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
5535          i <= std::min(DstIdx, SrcParts - 1); ++i) {
5536       MachineInstrBuilder Mul =
5537           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
5538       Factors.push_back(Mul.getReg(0));
5539     }
5540     // Collect high parts of muls from previous DstIdx.
5541     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5542          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5543       MachineInstrBuilder Umulh =
5544           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5545       Factors.push_back(Umulh.getReg(0));
5546     }
5547     // Add CarrySum from additions calculated for previous DstIdx.
5548     if (DstIdx != 1) {
5549       Factors.push_back(CarrySumPrevDstIdx);
5550     }
5551 
5552     Register CarrySum;
5553     // Add all factors and accumulate all carries into CarrySum.
5554     if (DstIdx != DstParts - 1) {
5555       MachineInstrBuilder Uaddo =
5556           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5557       FactorSum = Uaddo.getReg(0);
5558       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5559       for (unsigned i = 2; i < Factors.size(); ++i) {
5560         MachineInstrBuilder Uaddo =
5561             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5562         FactorSum = Uaddo.getReg(0);
5563         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5564         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5565       }
5566     } else {
5567       // Since the value for the next index is not calculated, neither is CarrySum.
5568       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5569       for (unsigned i = 2; i < Factors.size(); ++i)
5570         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5571     }
5572 
5573     CarrySumPrevDstIdx = CarrySum;
5574     DstRegs[DstIdx] = FactorSum;
5575     Factors.clear();
5576   }
5577 }
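
// The loops above implement the schoolbook multiword multiply: column K of the
// result accumulates the low halves of the products Src1[K - i] * Src2[i], the
// high halves of the products from column K - 1, and the carries of the
// previous column's additions. A minimal scalar sketch of the same scheme
// (illustrative only, assuming 32-bit limbs and DstParts == SrcParts, i.e. a
// truncating multiply):
//
//   void mulParts(uint32_t Dst[], const uint32_t A[], const uint32_t B[],
//                 unsigned Parts) {
//     uint64_t Carry = 0;                         // carries into the next column
//     for (unsigned K = 0; K < Parts; ++K) {
//       uint64_t Column = Carry;                  // start from the prior carries
//       Carry = 0;
//       for (unsigned I = 0; I <= K; ++I) {
//         uint64_t P = (uint64_t)A[K - I] * B[I]; // G_MUL low + G_UMULH high
//         Column += (uint32_t)P;                  // low half into this column
//         Carry += P >> 32;                       // high half feeds column K + 1
//       }
//       Carry += Column >> 32;                    // this column's own overflow
//       Dst[K] = (uint32_t)Column;
//     }
//   }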
5578 
5579 LegalizerHelper::LegalizeResult
5580 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5581                                     LLT NarrowTy) {
5582   if (TypeIdx != 0)
5583     return UnableToLegalize;
5584 
5585   Register DstReg = MI.getOperand(0).getReg();
5586   LLT DstType = MRI.getType(DstReg);
5587   // FIXME: add support for vector types
5588   if (DstType.isVector())
5589     return UnableToLegalize;
5590 
5591   unsigned Opcode = MI.getOpcode();
5592   unsigned OpO, OpE, OpF;
5593   switch (Opcode) {
5594   case TargetOpcode::G_SADDO:
5595   case TargetOpcode::G_SADDE:
5596   case TargetOpcode::G_UADDO:
5597   case TargetOpcode::G_UADDE:
5598   case TargetOpcode::G_ADD:
5599     OpO = TargetOpcode::G_UADDO;
5600     OpE = TargetOpcode::G_UADDE;
5601     OpF = TargetOpcode::G_UADDE;
5602     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5603       OpF = TargetOpcode::G_SADDE;
5604     break;
5605   case TargetOpcode::G_SSUBO:
5606   case TargetOpcode::G_SSUBE:
5607   case TargetOpcode::G_USUBO:
5608   case TargetOpcode::G_USUBE:
5609   case TargetOpcode::G_SUB:
5610     OpO = TargetOpcode::G_USUBO;
5611     OpE = TargetOpcode::G_USUBE;
5612     OpF = TargetOpcode::G_USUBE;
5613     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5614       OpF = TargetOpcode::G_SSUBE;
5615     break;
5616   default:
5617     llvm_unreachable("Unexpected add/sub opcode!");
5618   }
5619 
5620   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5621   unsigned NumDefs = MI.getNumExplicitDefs();
5622   Register Src1 = MI.getOperand(NumDefs).getReg();
5623   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5624   Register CarryDst, CarryIn;
5625   if (NumDefs == 2)
5626     CarryDst = MI.getOperand(1).getReg();
5627   if (MI.getNumOperands() == NumDefs + 3)
5628     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5629 
5630   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5631   LLT LeftoverTy, DummyTy;
5632   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5633   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5634   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5635 
5636   int NarrowParts = Src1Regs.size();
5637   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5638     Src1Regs.push_back(Src1Left[I]);
5639     Src2Regs.push_back(Src2Left[I]);
5640   }
5641   DstRegs.reserve(Src1Regs.size());
5642 
5643   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5644     Register DstReg =
5645         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5646     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5647     // Forward the final carry-out to the destination register
5648     if (i == e - 1 && CarryDst)
5649       CarryOut = CarryDst;
5650 
5651     if (!CarryIn) {
5652       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5653                             {Src1Regs[i], Src2Regs[i]});
5654     } else if (i == e - 1) {
5655       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5656                             {Src1Regs[i], Src2Regs[i], CarryIn});
5657     } else {
5658       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5659                             {Src1Regs[i], Src2Regs[i], CarryIn});
5660     }
5661 
5662     DstRegs.push_back(DstReg);
5663     CarryIn = CarryOut;
5664   }
5665   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5666               ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5667               ArrayRef(DstRegs).drop_front(NarrowParts));
5668 
5669   MI.eraseFromParent();
5670   return Legalized;
5671 }
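
// For example (a sketch, not actual emitted MIR), narrowing an s128 G_ADD to
// s64 pieces produces a carry chain where each piece's carry-out feeds the
// next piece's carry-in:
//
//   %lo(s64), %c1(s1) = G_UADDO %a_lo, %b_lo
//   %hi(s64), %c2(s1) = G_UADDE %a_hi, %b_hi, %c1
//   %sum(s128)        = G_MERGE_VALUES %lo, %hi
//
// Only the topmost piece switches to G_SADDE/G_SSUBE for the signed-overflow
// variants, since the wide operation's overflow flag is defined by its most
// significant limb.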
5672 
5673 LegalizerHelper::LegalizeResult
5674 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5675   auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
5676 
5677   LLT Ty = MRI.getType(DstReg);
5678   if (Ty.isVector())
5679     return UnableToLegalize;
5680 
5681   unsigned Size = Ty.getSizeInBits();
5682   unsigned NarrowSize = NarrowTy.getSizeInBits();
5683   if (Size % NarrowSize != 0)
5684     return UnableToLegalize;
5685 
5686   unsigned NumParts = Size / NarrowSize;
5687   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5688   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
5689 
5690   SmallVector<Register, 2> Src1Parts, Src2Parts;
5691   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5692   extractParts(Src1, NarrowTy, NumParts, Src1Parts);
5693   extractParts(Src2, NarrowTy, NumParts, Src2Parts);
5694   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5695 
5696   // Take only the high half of the registers if this is a high multiply.
5697   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5698   MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5699   MI.eraseFromParent();
5700   return Legalized;
5701 }
5702 
5703 LegalizerHelper::LegalizeResult
5704 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5705                                    LLT NarrowTy) {
5706   if (TypeIdx != 0)
5707     return UnableToLegalize;
5708 
5709   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5710 
5711   Register Src = MI.getOperand(1).getReg();
5712   LLT SrcTy = MRI.getType(Src);
5713 
5714   // If all finite floats fit into the narrowed integer type, we can just swap
5715   // out the result type. In practice only half qualifies: its largest finite
5716   // value, 65504, needs 16 unsigned or 17 signed bits, so handle just that case.
5717   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5718       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5719     return UnableToLegalize;
5720 
5721   Observer.changingInstr(MI);
5722   narrowScalarDst(MI, NarrowTy, 0,
5723                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5724   Observer.changedInstr(MI);
5725   return Legalized;
5726 }
5727 
5728 LegalizerHelper::LegalizeResult
5729 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5730                                      LLT NarrowTy) {
5731   if (TypeIdx != 1)
5732     return UnableToLegalize;
5733 
5734   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5735 
5736   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5737   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5738   // NarrowSize.
5739   if (SizeOp1 % NarrowSize != 0)
5740     return UnableToLegalize;
5741   int NumParts = SizeOp1 / NarrowSize;
5742 
5743   SmallVector<Register, 2> SrcRegs, DstRegs;
5744   SmallVector<uint64_t, 2> Indexes;
5745   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5746 
5747   Register OpReg = MI.getOperand(0).getReg();
5748   uint64_t OpStart = MI.getOperand(2).getImm();
5749   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5750   for (int i = 0; i < NumParts; ++i) {
5751     unsigned SrcStart = i * NarrowSize;
5752 
5753     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5754       // No part of the extract uses this subregister, ignore it.
5755       continue;
5756     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5757       // The entire subregister is extracted, forward the value.
5758       DstRegs.push_back(SrcRegs[i]);
5759       continue;
5760     }
5761 
5762     // Compute the offset into this source piece and the number of bits to take
5763     // from it, clamped to its overlap with the extracted range.
5764     int64_t ExtractOffset;
5765     uint64_t SegSize;
5766     if (OpStart < SrcStart) {
5767       ExtractOffset = 0;
5768       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5769     } else {
5770       ExtractOffset = OpStart - SrcStart;
5771       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5772     }
5773 
5774     Register SegReg = SrcRegs[i];
5775     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5776       // A genuine extract is needed.
5777       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5778       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5779     }
5780 
5781     DstRegs.push_back(SegReg);
5782   }
5783 
5784   Register DstReg = MI.getOperand(0).getReg();
5785   if (MRI.getType(DstReg).isVector())
5786     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5787   else if (DstRegs.size() > 1)
5788     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5789   else
5790     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5791   MI.eraseFromParent();
5792   return Legalized;
5793 }
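
// A worked example of the clamping above (illustrative values): extracting an
// s48 at bit offset 40 from an s128 split into s32 pieces. Piece 1 covers bits
// [32, 64); OpStart (40) falls inside it, so ExtractOffset = 40 - 32 = 8 and
// SegSize = min(32 + 32 - 40, 48) = 24. Piece 2 covers bits [64, 96); it
// starts past OpStart, so ExtractOffset = 0 and SegSize = min(32, 88 - 64) =
// 24. Pieces 0 and 3 don't overlap the extracted range and are skipped, and
// the two 24-bit segments are remerged into the s48 result.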
5794 
5795 LegalizerHelper::LegalizeResult
5796 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5797                                     LLT NarrowTy) {
5798   // FIXME: Don't know how to handle secondary types yet.
5799   if (TypeIdx != 0)
5800     return UnableToLegalize;
5801 
5802   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5803   SmallVector<uint64_t, 2> Indexes;
5804   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5805   LLT LeftoverTy;
5806   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5807                LeftoverRegs);
5808 
5809   for (Register Reg : LeftoverRegs)
5810     SrcRegs.push_back(Reg);
5811 
5812   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5813   Register OpReg = MI.getOperand(2).getReg();
5814   uint64_t OpStart = MI.getOperand(3).getImm();
5815   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5816   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5817     unsigned DstStart = I * NarrowSize;
5818 
5819     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5820       // The entire subregister is defined by this insert, forward the new
5821       // value.
5822       DstRegs.push_back(OpReg);
5823       continue;
5824     }
5825 
5826     Register SrcReg = SrcRegs[I];
5827     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5828       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5829       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5830       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5831     }
5832 
5833     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5834       // No part of the insert affects this subregister, forward the original.
5835       DstRegs.push_back(SrcReg);
5836       continue;
5837     }
5838 
5839     // Compute the overlap of the inserted value with this piece: where to
5840     // extract from OpReg, where to insert in the piece, and the segment size.
5841     int64_t ExtractOffset, InsertOffset;
5842     uint64_t SegSize;
5843     if (OpStart < DstStart) {
5844       InsertOffset = 0;
5845       ExtractOffset = DstStart - OpStart;
5846       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5847     } else {
5848       InsertOffset = OpStart - DstStart;
5849       ExtractOffset = 0;
5850       SegSize =
5851         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5852     }
5853 
5854     Register SegReg = OpReg;
5855     if (ExtractOffset != 0 || SegSize != OpSize) {
5856       // A genuine extract is needed.
5857       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5858       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5859     }
5860 
5861     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5862     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5863     DstRegs.push_back(DstReg);
5864   }
5865 
5866   uint64_t WideSize = DstRegs.size() * NarrowSize;
5867   Register DstReg = MI.getOperand(0).getReg();
5868   if (WideSize > RegTy.getSizeInBits()) {
5869     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5870     MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
5871     MIRBuilder.buildTrunc(DstReg, MergeReg);
5872   } else
5873     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5874 
5875   MI.eraseFromParent();
5876   return Legalized;
5877 }
5878 
5879 LegalizerHelper::LegalizeResult
5880 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5881                                    LLT NarrowTy) {
5882   Register DstReg = MI.getOperand(0).getReg();
5883   LLT DstTy = MRI.getType(DstReg);
5884 
5885   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5886 
5887   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5888   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5889   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5890   LLT LeftoverTy;
5891   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5892                     Src0Regs, Src0LeftoverRegs))
5893     return UnableToLegalize;
5894 
5895   LLT Unused;
5896   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5897                     Src1Regs, Src1LeftoverRegs))
5898     llvm_unreachable("inconsistent extractParts result");
5899 
5900   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5901     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5902                                         {Src0Regs[I], Src1Regs[I]});
5903     DstRegs.push_back(Inst.getReg(0));
5904   }
5905 
5906   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5907     auto Inst = MIRBuilder.buildInstr(
5908       MI.getOpcode(),
5909       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5910     DstLeftoverRegs.push_back(Inst.getReg(0));
5911   }
5912 
5913   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5914               LeftoverTy, DstLeftoverRegs);
5915 
5916   MI.eraseFromParent();
5917   return Legalized;
5918 }
5919 
5920 LegalizerHelper::LegalizeResult
5921 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5922                                  LLT NarrowTy) {
5923   if (TypeIdx != 0)
5924     return UnableToLegalize;
5925 
5926   auto [DstReg, SrcReg] = MI.getFirst2Regs();
5927 
5928   LLT DstTy = MRI.getType(DstReg);
5929   if (DstTy.isVector())
5930     return UnableToLegalize;
5931 
5932   SmallVector<Register, 8> Parts;
5933   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5934   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5935   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5936 
5937   MI.eraseFromParent();
5938   return Legalized;
5939 }
5940 
5941 LegalizerHelper::LegalizeResult
5942 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5943                                     LLT NarrowTy) {
5944   if (TypeIdx != 0)
5945     return UnableToLegalize;
5946 
5947   Register CondReg = MI.getOperand(1).getReg();
5948   LLT CondTy = MRI.getType(CondReg);
5949   if (CondTy.isVector()) // TODO: Handle vselect
5950     return UnableToLegalize;
5951 
5952   Register DstReg = MI.getOperand(0).getReg();
5953   LLT DstTy = MRI.getType(DstReg);
5954 
5955   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5956   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5957   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5958   LLT LeftoverTy;
5959   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5960                     Src1Regs, Src1LeftoverRegs))
5961     return UnableToLegalize;
5962 
5963   LLT Unused;
5964   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5965                     Src2Regs, Src2LeftoverRegs))
5966     llvm_unreachable("inconsistent extractParts result");
5967 
5968   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5969     auto Select = MIRBuilder.buildSelect(NarrowTy,
5970                                          CondReg, Src1Regs[I], Src2Regs[I]);
5971     DstRegs.push_back(Select.getReg(0));
5972   }
5973 
5974   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5975     auto Select = MIRBuilder.buildSelect(
5976       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5977     DstLeftoverRegs.push_back(Select.getReg(0));
5978   }
5979 
5980   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5981               LeftoverTy, DstLeftoverRegs);
5982 
5983   MI.eraseFromParent();
5984   return Legalized;
5985 }
5986 
5987 LegalizerHelper::LegalizeResult
5988 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5989                                   LLT NarrowTy) {
5990   if (TypeIdx != 1)
5991     return UnableToLegalize;
5992 
5993   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5994   unsigned NarrowSize = NarrowTy.getSizeInBits();
5995 
5996   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5997     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5998 
5999     MachineIRBuilder &B = MIRBuilder;
6000     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
6001     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
6002     auto C_0 = B.buildConstant(NarrowTy, 0);
6003     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
6004                                 UnmergeSrc.getReg(1), C_0);
6005     auto LoCTLZ = IsUndef ?
6006       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
6007       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
6008     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
6009     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
6010     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
6011     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
6012 
6013     MI.eraseFromParent();
6014     return Legalized;
6015   }
6016 
6017   return UnableToLegalize;
6018 }
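
// The equivalent scalar computation, as an illustrative sketch on a u64 split
// into two u32 halves (ctlz32 is a stand-in for the narrow count):
//
//   unsigned ctlz64(uint32_t Hi, uint32_t Lo) {
//     return Hi == 0 ? 32 + ctlz32(Lo) : ctlz32(Hi);
//   }
//
// The high half always uses the zero-undef form since that operand is only
// selected when Hi != 0; the low half must keep the original opcode's
// semantics for an all-zero source.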
6019 
6020 LegalizerHelper::LegalizeResult
6021 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
6022                                   LLT NarrowTy) {
6023   if (TypeIdx != 1)
6024     return UnableToLegalize;
6025 
6026   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6027   unsigned NarrowSize = NarrowTy.getSizeInBits();
6028 
6029   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6030     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
6031 
6032     MachineIRBuilder &B = MIRBuilder;
6033     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
6034     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
6035     auto C_0 = B.buildConstant(NarrowTy, 0);
6036     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
6037                                 UnmergeSrc.getReg(0), C_0);
6038     auto HiCTTZ = IsUndef ?
6039       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
6040       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
6041     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
6042     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
6043     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
6044     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
6045 
6046     MI.eraseFromParent();
6047     return Legalized;
6048   }
6049 
6050   return UnableToLegalize;
6051 }
6052 
6053 LegalizerHelper::LegalizeResult
6054 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
6055                                    LLT NarrowTy) {
6056   if (TypeIdx != 1)
6057     return UnableToLegalize;
6058 
6059   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6060   unsigned NarrowSize = NarrowTy.getSizeInBits();
6061 
6062   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6063     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
6064 
6065     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
6066     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
6067     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
6068 
6069     MI.eraseFromParent();
6070     return Legalized;
6071   }
6072 
6073   return UnableToLegalize;
6074 }
6075 
6076 LegalizerHelper::LegalizeResult
6077 LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
6078                                     LLT NarrowTy) {
6079   if (TypeIdx != 1)
6080     return UnableToLegalize;
6081 
6082   MachineIRBuilder &B = MIRBuilder;
6083   Register ExpReg = MI.getOperand(2).getReg();
6084   LLT ExpTy = MRI.getType(ExpReg);
6085 
6086   unsigned ClampSize = NarrowTy.getScalarSizeInBits();
6087 
6088   // Clamp the exponent to the range of the target type.
6089   auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
6090   auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
6091   auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
6092   auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);
6093 
6094   auto Trunc = B.buildTrunc(NarrowTy, Clamp);
6095   Observer.changingInstr(MI);
6096   MI.getOperand(2).setReg(Trunc.getReg(0));
6097   Observer.changedInstr(MI);
6098   return Legalized;
6099 }
6100 
6101 LegalizerHelper::LegalizeResult
6102 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
6103   unsigned Opc = MI.getOpcode();
6104   const auto &TII = MIRBuilder.getTII();
6105   auto isSupported = [this](const LegalityQuery &Q) {
6106     auto QAction = LI.getAction(Q).Action;
6107     return QAction == Legal || QAction == Libcall || QAction == Custom;
6108   };
6109   switch (Opc) {
6110   default:
6111     return UnableToLegalize;
6112   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
6113     // This trivially expands to CTLZ.
6114     Observer.changingInstr(MI);
6115     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
6116     Observer.changedInstr(MI);
6117     return Legalized;
6118   }
6119   case TargetOpcode::G_CTLZ: {
6120     auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6121     unsigned Len = SrcTy.getSizeInBits();
6122 
6123     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
6124       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
6125       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
6126       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
6127       auto ICmp = MIRBuilder.buildICmp(
6128           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
6129       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
6130       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
6131       MI.eraseFromParent();
6132       return Legalized;
6133     }
6134     // for now, we do this:
6135     // NewLen = NextPowerOf2(Len);
6136     // x = x | (x >> 1);
6137     // x = x | (x >> 2);
6138     // ...
6139     // x = x | (x >> 16);
6140     // x = x | (x >> 32); // for 64-bit input
6141     // ... continuing up to a shift of NewLen/2
6142     // return Len - popcount(x);
6143     //
6144     // Ref: "Hacker's Delight" by Henry Warren
6145     Register Op = SrcReg;
6146     unsigned NewLen = PowerOf2Ceil(Len);
6147     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
6148       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
6149       auto MIBOp = MIRBuilder.buildOr(
6150           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
6151       Op = MIBOp.getReg(0);
6152     }
6153     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
6154     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
6155                         MIBPop);
6156     MI.eraseFromParent();
6157     return Legalized;
6158   }
6159   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
6160     // This trivially expands to CTTZ.
6161     Observer.changingInstr(MI);
6162     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
6163     Observer.changedInstr(MI);
6164     return Legalized;
6165   }
6166   case TargetOpcode::G_CTTZ: {
6167     auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6168 
6169     unsigned Len = SrcTy.getSizeInBits();
6170     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
6171       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
6172       // zero.
6173       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
6174       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
6175       auto ICmp = MIRBuilder.buildICmp(
6176           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
6177       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
6178       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
6179       MI.eraseFromParent();
6180       return Legalized;
6181     }
6182     // for now, we use: { return popcount(~x & (x - 1)); }
6183     // unless the target has ctlz but not ctpop, in which case we use:
6184     // { return 32 - nlz(~x & (x-1)); }
6185     // Ref: "Hacker's Delight" by Henry Warren
6186     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
6187     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
6188     auto MIBTmp = MIRBuilder.buildAnd(
6189         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
6190     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
6191         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
6192       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
6193       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
6194                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
6195       MI.eraseFromParent();
6196       return Legalized;
6197     }
6198     Observer.changingInstr(MI);
6199     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
6200     MI.getOperand(1).setReg(MIBTmp.getReg(0));
6201     Observer.changedInstr(MI);
6202     return Legalized;
6203   }
6204   case TargetOpcode::G_CTPOP: {
6205     Register SrcReg = MI.getOperand(1).getReg();
6206     LLT Ty = MRI.getType(SrcReg);
6207     unsigned Size = Ty.getSizeInBits();
6208     MachineIRBuilder &B = MIRBuilder;
6209 
6210     // Count set bits in blocks of 2 bits. The default approach would be
6211     // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
6212     // We use the following formula instead:
6213     // B2Count = val - { (val >> 1) & 0x55555555 }
6214     // since it gives the same result per 2-bit block with one instruction fewer.
6215     auto C_1 = B.buildConstant(Ty, 1);
6216     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
6217     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
6218     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
6219     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
6220     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
6221 
6222     // To get the count in blocks of 4, add the values from adjacent 2-bit blocks.
6223     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
6224     auto C_2 = B.buildConstant(Ty, 2);
6225     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
6226     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
6227     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
6228     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
6229     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
6230     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
6231 
6232     // For the count in blocks of 8 bits we don't have to mask the high 4 bits
6233     // before the addition, since each count is in the range {0,...,8} and 4 bits
6234     // suffice to hold it. After the addition the high 4 bits still hold the
6235     // count of the high 4-bit block; clear them to get the 8-bit result.
6236     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
6237     auto C_4 = B.buildConstant(Ty, 4);
6238     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
6239     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
6240     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
6241     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
6242     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
6243 
6244     assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
6245     // 8 bits can hold the CTPOP result of a 128-bit int or smaller. Multiplying
6246     // by this bitmask sets the 8 MSBs of ResTmp to the sum of all 8-bit blocks.
6247     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
6248     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
6249 
6250     // Shift count result from 8 high bits to low bits.
6251     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
6252     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
6253 
6254     MI.eraseFromParent();
6255     return Legalized;
6256   }
6257   }
6258 }
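
// The G_CTPOP lowering above is the classic "Hacker's Delight" parallel bit
// count with a final multiply that sums the byte counts. A 32-bit scalar
// sketch (illustrative only; the emitted code splats the same mask bytes out
// to Size bits):
//
//   unsigned popcount32(uint32_t V) {
//     V = V - ((V >> 1) & 0x55555555);                // 2-bit block counts
//     V = (V & 0x33333333) + ((V >> 2) & 0x33333333); // 4-bit block counts
//     V = (V + (V >> 4)) & 0x0F0F0F0F;                // 8-bit block counts
//     return (V * 0x01010101) >> 24;                  // sum bytes into the MSB
//   }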
6259 
6260 // Check that (every element of) Reg is undef or not an exact multiple of BW.
6261 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
6262                                         Register Reg, unsigned BW) {
6263   return matchUnaryPredicate(
6264       MRI, Reg,
6265       [=](const Constant *C) {
6266         // Null constant here means an undef.
6267         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
6268         return !CI || CI->getValue().urem(BW) != 0;
6269       },
6270       /*AllowUndefs*/ true);
6271 }
6272 
6273 LegalizerHelper::LegalizeResult
6274 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
6275   auto [Dst, X, Y, Z] = MI.getFirst4Regs();
6276   LLT Ty = MRI.getType(Dst);
6277   LLT ShTy = MRI.getType(Z);
6278 
6279   unsigned BW = Ty.getScalarSizeInBits();
6280 
6281   if (!isPowerOf2_32(BW))
6282     return UnableToLegalize;
6283 
6284   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6285   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6286 
6287   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
6288     // fshl X, Y, Z -> fshr X, Y, -Z
6289     // fshr X, Y, Z -> fshl X, Y, -Z
6290     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
6291     Z = MIRBuilder.buildSub(ShTy, Zero, Z).getReg(0);
6292   } else {
6293     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
6294     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
6295     auto One = MIRBuilder.buildConstant(ShTy, 1);
6296     if (IsFSHL) {
6297       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
6298       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
6299     } else {
6300       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
6301       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
6302     }
6303 
6304     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
6305   }
6306 
6307   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
6308   MI.eraseFromParent();
6309   return Legalized;
6310 }
6311 
6312 LegalizerHelper::LegalizeResult
6313 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
6314   auto [Dst, X, Y, Z] = MI.getFirst4Regs();
6315   LLT Ty = MRI.getType(Dst);
6316   LLT ShTy = MRI.getType(Z);
6317 
6318   const unsigned BW = Ty.getScalarSizeInBits();
6319   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6320 
6321   Register ShX, ShY;
6322   Register ShAmt, InvShAmt;
6323 
6324   // FIXME: Emit optimized urem by constant instead of letting it expand later.
6325   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
6326     // fshl: X << C | Y >> (BW - C)
6327     // fshr: X << (BW - C) | Y >> C
6328     // where C = Z % BW is not zero
6329     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
6330     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
6331     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
6332     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
6333     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
6334   } else {
6335     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
6336     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
6337     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
6338     if (isPowerOf2_32(BW)) {
6339       // Z % BW -> Z & (BW - 1)
6340       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
6341       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
6342       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
6343       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
6344     } else {
6345       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
6346       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
6347       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
6348     }
6349 
6350     auto One = MIRBuilder.buildConstant(ShTy, 1);
6351     if (IsFSHL) {
6352       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
6353       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
6354       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
6355     } else {
6356       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
6357       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
6358       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
6359     }
6360   }
6361 
6362   MIRBuilder.buildOr(Dst, ShX, ShY);
6363   MI.eraseFromParent();
6364   return Legalized;
6365 }
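
// As a scalar reference (a sketch assuming a power-of-two bit width, here 32):
//
//   uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
//     unsigned ShAmt = Z & 31;     // Z % BW
//     unsigned InvShAmt = ~Z & 31; // BW - 1 - (Z % BW)
//     return (X << ShAmt) | ((Y >> 1) >> InvShAmt);
//   }
//
// The extra ">> 1" keeps every shift amount strictly below BW, so the
// expansion never needs a full-width (poison-producing) shift when
// Z % BW == 0.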
6366 
6367 LegalizerHelper::LegalizeResult
6368 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
6369   // These operations approximately do the following (while avoiding undefined
6370   // shifts by BW):
6371   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
6372   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
6373   Register Dst = MI.getOperand(0).getReg();
6374   LLT Ty = MRI.getType(Dst);
6375   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
6376 
6377   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6378   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6379 
6380   // TODO: Use smarter heuristic that accounts for vector legalization.
6381   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
6382     return lowerFunnelShiftAsShifts(MI);
6383 
6384   // This only works for powers of 2; fall back to shifts if it fails.
6385   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
6386   if (Result == UnableToLegalize)
6387     return lowerFunnelShiftAsShifts(MI);
6388   return Result;
6389 }
6390 
6391 LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
6392   auto [Dst, Src] = MI.getFirst2Regs();
6393   LLT DstTy = MRI.getType(Dst);
6394   LLT SrcTy = MRI.getType(Src);
6395 
6396   uint32_t DstTySize = DstTy.getSizeInBits();
6397   uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
6398   uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();
6399 
6400   if (!isPowerOf2_32(DstTySize) || !isPowerOf2_32(DstTyScalarSize) ||
6401       !isPowerOf2_32(SrcTyScalarSize))
6402     return UnableToLegalize;
6403 
6404   // The step between the extends is too large; split it by creating an
6405   // intermediate extend instruction.
6406   if (SrcTyScalarSize * 2 < DstTyScalarSize) {
6407     LLT MidTy = SrcTy.changeElementSize(SrcTyScalarSize * 2);
6408     // If the destination type is illegal, split it into multiple statements
6409     // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
6410     auto NewExt = MIRBuilder.buildInstr(MI.getOpcode(), {MidTy}, {Src});
6411     // Unmerge the vector
6412     LLT EltTy = MidTy.changeElementCount(
6413         MidTy.getElementCount().divideCoefficientBy(2));
6414     auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, NewExt);
6415 
6416     // ZExt the vectors
6417     LLT ZExtResTy = DstTy.changeElementCount(
6418         DstTy.getElementCount().divideCoefficientBy(2));
6419     auto ZExtRes1 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
6420                                           {UnmergeSrc.getReg(0)});
6421     auto ZExtRes2 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
6422                                           {UnmergeSrc.getReg(1)});
6423 
6424     // Merge the ending vectors
6425     MIRBuilder.buildMergeLikeInstr(Dst, {ZExtRes1, ZExtRes2});
6426 
6427     MI.eraseFromParent();
6428     return Legalized;
6429   }
6430   return UnableToLegalize;
6431 }
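
// For example (an illustrative sketch), extending <4 x s8> to <4 x s32> more
// than doubles the element size, so it is split through an intermediate
// <4 x s16>:
//
//   %mid(<4 x s16>) = G_ZEXT %src(<4 x s8>)
//   %lo(<2 x s16>), %hi(<2 x s16>) = G_UNMERGE %mid
//   %zlo(<2 x s32>) = G_ZEXT %lo
//   %zhi(<2 x s32>) = G_ZEXT %hi
//   %dst(<4 x s32>) = G_CONCAT_VECTORS %zlo, %zhi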
6432 
6433 LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
6435   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
6436   // Similar to how operand splitting is done in SelectionDAG, we can handle
6437   // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
6438   //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
6439   //   %lo16(<4 x s16>) = G_TRUNC %inlo
6440   //   %hi16(<4 x s16>) = G_TRUNC %inhi
6441   //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
6442   //   %res(<8 x s8>) = G_TRUNC %in16
6443 
6444   assert(MI.getOpcode() == TargetOpcode::G_TRUNC);
6445 
6446   Register DstReg = MI.getOperand(0).getReg();
6447   Register SrcReg = MI.getOperand(1).getReg();
6448   LLT DstTy = MRI.getType(DstReg);
6449   LLT SrcTy = MRI.getType(SrcReg);
6450 
6451   if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
6452       isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
6453       isPowerOf2_32(SrcTy.getNumElements()) &&
6454       isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
6455     // Split input type.
6456     LLT SplitSrcTy = SrcTy.changeElementCount(
6457         SrcTy.getElementCount().divideCoefficientBy(2));
6458 
6459     // First, split the source into two smaller vectors.
6460     SmallVector<Register, 2> SplitSrcs;
6461     extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs);
6462 
6463     // Truncate the splits into intermediate narrower elements.
6464     LLT InterTy;
6465     if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
6466       InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
6467     else
6468       InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits());
6469     for (unsigned I = 0; I < SplitSrcs.size(); ++I) {
6470       SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
6471     }
6472 
6473     // Combine the new truncates into one vector
6474     auto Merge = MIRBuilder.buildMergeLikeInstr(
6475         DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs);
6476 
6477     // Truncate the new vector to the final result type
6478     if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
6479       MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0));
6480     else
6481       MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0));
6482 
6483     MI.eraseFromParent();
6484 
6485     return Legalized;
6486   }
6487   return UnableToLegalize;
6488 }
6489 
6490 LegalizerHelper::LegalizeResult
6491 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
6492   auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
6493   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6494   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6495   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6496   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6497   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
6498   MI.eraseFromParent();
6499   return Legalized;
6500 }
6501 
6502 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
6503   auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
6504 
6505   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
6506   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6507 
6508   MIRBuilder.setInstrAndDebugLoc(MI);
6509 
6510   // If a rotate in the other direction is supported, use it.
6511   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6512   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
6513       isPowerOf2_32(EltSizeInBits))
6514     return lowerRotateWithReverseRotate(MI);
6515 
6516   // If a funnel shift is supported, use it.
6517   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6518   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6519   bool IsFShLegal = false;
6520   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
6521       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
6522     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
6523                                 Register R3) {
6524       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
6525       MI.eraseFromParent();
6526       return Legalized;
6527     };
6528     // If a funnel shift in the other direction is supported, use it.
6529     if (IsFShLegal) {
6530       return buildFunnelShift(FShOpc, Dst, Src, Amt);
6531     } else if (isPowerOf2_32(EltSizeInBits)) {
6532       Amt = MIRBuilder.buildNeg(AmtTy, Amt).getReg(0);
6533       return buildFunnelShift(RevFsh, Dst, Src, Amt);
6534     }
6535   }
6536 
6537   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6538   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
6539   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
6540   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
6541   Register ShVal;
6542   Register RevShiftVal;
6543   if (isPowerOf2_32(EltSizeInBits)) {
6544     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
6545     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
6546     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6547     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
6548     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6549     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
6550     RevShiftVal =
6551         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
6552   } else {
6553     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
6554     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
6555     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
6556     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
6557     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6558     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
6559     auto One = MIRBuilder.buildConstant(AmtTy, 1);
6560     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
6561     RevShiftVal =
6562         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
6563   }
6564   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
6565   MI.eraseFromParent();
6566   return Legalized;
6567 }
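
// The power-of-two path above is the usual branch-free rotate; as a 32-bit
// scalar sketch:
//
//   uint32_t rotl32(uint32_t X, uint32_t C) {
//     return (X << (C & 31)) | (X >> (-C & 31));
//   }
//
// Masking both amounts with (w - 1) keeps them in [0, w), and the negated
// amount supplies the complementary shift, so C == 0 harmlessly degenerates
// to X | X.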
6568 
6569 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
6570 // representation.
6571 LegalizerHelper::LegalizeResult
6572 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
6573   auto [Dst, Src] = MI.getFirst2Regs();
6574   const LLT S64 = LLT::scalar(64);
6575   const LLT S32 = LLT::scalar(32);
6576   const LLT S1 = LLT::scalar(1);
6577 
6578   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
6579 
6580   // float cul2f(ulong u) {
6581   //   uint lz = clz(u);
6582   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
6583   //   u = (u << lz) & 0x7fffffffffffffffUL;
6584   //   ulong t = u & 0xffffffffffUL;
6585   //   uint v = (e << 23) | (uint)(u >> 40);
6586   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
6587   //   return as_float(v + r);
6588   // }
6589 
6590   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
6591   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
6592 
6593   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
6594 
6595   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
6596   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
6597 
6598   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
6599   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
6600 
6601   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
6602   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
6603 
6604   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
6605 
6606   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
6607   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
6608 
6609   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
6610   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
6611   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
6612 
6613   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
6614   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
6615   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
6616   auto One = MIRBuilder.buildConstant(S32, 1);
6617 
6618   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
6619   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
6620   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
6621   MIRBuilder.buildAdd(Dst, V, R);
6622 
6623   MI.eraseFromParent();
6624   return Legalized;
6625 }
6626 
6627 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6628   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6629 
6630   if (SrcTy == LLT::scalar(1)) {
6631     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6632     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6633     MIRBuilder.buildSelect(Dst, Src, True, False);
6634     MI.eraseFromParent();
6635     return Legalized;
6636   }
6637 
6638   if (SrcTy != LLT::scalar(64))
6639     return UnableToLegalize;
6640 
6641   if (DstTy == LLT::scalar(32)) {
6642     // TODO: SelectionDAG has several alternative expansions to port which may
6643     // be more reasonable depending on the available instructions. If a target
6644     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6645     // intermediate type, this is probably worse.
6646     return lowerU64ToF32BitOps(MI);
6647   }
6648 
6649   return UnableToLegalize;
6650 }
6651 
6652 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6653   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6654 
6655   const LLT S64 = LLT::scalar(64);
6656   const LLT S32 = LLT::scalar(32);
6657   const LLT S1 = LLT::scalar(1);
6658 
6659   if (SrcTy == S1) {
6660     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6661     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6662     MIRBuilder.buildSelect(Dst, Src, True, False);
6663     MI.eraseFromParent();
6664     return Legalized;
6665   }
6666 
6667   if (SrcTy != S64)
6668     return UnableToLegalize;
6669 
6670   if (DstTy == S32) {
6671     // float cl2f(long l) {
6672     //   long s = l >> 63;
6673     //   float r = cul2f((l + s) ^ s);
6674     //   return s ? -r : r;
6675     // }
6676     Register L = Src;
6677     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6678     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6679 
6680     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6681     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6682     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6683 
6684     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6685     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6686                                             MIRBuilder.buildConstant(S64, 0));
6687     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6688     MI.eraseFromParent();
6689     return Legalized;
6690   }
6691 
6692   return UnableToLegalize;
6693 }
6694 
6695 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6696   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6697   const LLT S64 = LLT::scalar(64);
6698   const LLT S32 = LLT::scalar(32);
6699 
6700   if (SrcTy != S64 && SrcTy != S32)
6701     return UnableToLegalize;
6702   if (DstTy != S32 && DstTy != S64)
6703     return UnableToLegalize;
6704 
6705   // FPTOSI gives the same result as FPTOUI for positive signed integers.
6706   // FPTOUI additionally has to handle fp values that convert to unsigned
6707   // integers >= 2^31 for a 32-bit result or 2^63 for a 64-bit one; call it 2^Exp.
6708 
6709   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6710   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6711                                                 : APFloat::IEEEdouble(),
6712                     APInt::getZero(SrcTy.getSizeInBits()));
6713   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6714 
6715   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6716 
6717   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6718   // For fp values greater than or equal to the threshold (2^Exp), we use FPTOSI
6719   // on (Value - 2^Exp) and add 2^Exp back by setting the highest bit of the result.
6720   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6721   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6722   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6723   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6724 
6725   const LLT S1 = LLT::scalar(1);
6726 
6727   MachineInstrBuilder FCMP =
6728       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6729   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6730 
6731   MI.eraseFromParent();
6732   return Legalized;
6733 }
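
// The same trick as a scalar sketch (illustrative only), for f64 -> u64:
//
//   uint64_t fptoui64(double V) {
//     const double Threshold = 0x1p63;       // 2^63
//     if (!(V >= Threshold))                 // FCMP_ULT: also true for NaN
//       return (uint64_t)(int64_t)V;         // plain FPTOSI suffices
//     return (uint64_t)(int64_t)(V - Threshold) ^ (1ULL << 63);
//   }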
6734 
6735 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6736   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6737   const LLT S64 = LLT::scalar(64);
6738   const LLT S32 = LLT::scalar(32);
6739 
6740   // FIXME: Only f32 to i64 conversions are supported.
6741   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6742     return UnableToLegalize;
6743 
6744   // Expand f32 -> i64 conversion
6745   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6746   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
6747 
6748   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6749 
6750   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6751   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6752 
6753   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6754   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6755 
6756   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6757                                            APInt::getSignMask(SrcEltBits));
6758   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6759   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6760   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6761   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6762 
6763   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6764   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6765   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6766 
6767   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6768   R = MIRBuilder.buildZExt(DstTy, R);
6769 
6770   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6771   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6772   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6773   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6774 
6775   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6776   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6777 
6778   const LLT S1 = LLT::scalar(1);
6779   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6780                                     S1, Exponent, ExponentLoBit);
6781 
6782   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6783 
6784   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6785   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6786 
6787   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6788 
6789   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6790                                           S1, Exponent, ZeroSrcTy);
6791 
6792   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6793   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6794 
6795   MI.eraseFromParent();
6796   return Legalized;
6797 }
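
// A scalar sketch of the same algorithm (illustrative; compiler-rt's fixsfdi
// is the authoritative version). Like the lowering, the sketch does not
// saturate out-of-range or NaN inputs:
//
//   int64_t fixsfdi(float F) {
//     uint32_t Bits;
//     memcpy(&Bits, &F, sizeof(Bits));                // bitcast to integer
//     int64_t Sign = (Bits >> 31) ? -1 : 0;
//     int32_t Exponent = (int32_t)((Bits >> 23) & 0xFF) - 127;
//     uint64_t R = (Bits & 0x007FFFFF) | 0x00800000;  // implicit leading one
//     if (Exponent < 0)
//       return 0;                                     // |F| < 1
//     R = Exponent > 23 ? R << (Exponent - 23) : R >> (23 - Exponent);
//     return ((int64_t)R ^ Sign) - Sign;              // conditional negate
//   }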
6798 
6799 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6800 LegalizerHelper::LegalizeResult
6801 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6802   const LLT S1 = LLT::scalar(1);
6803   const LLT S32 = LLT::scalar(32);
6804 
6805   auto [Dst, Src] = MI.getFirst2Regs();
6806   assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
6807          MRI.getType(Src).getScalarType() == LLT::scalar(64));
6808 
6809   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6810     return UnableToLegalize;
6811 
6812   if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
6813     unsigned Flags = MI.getFlags();
6814     auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
6815     MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
6816     MI.eraseFromParent();
6817     return Legalized;
6818   }
6819 
6820   const unsigned ExpMask = 0x7ff;
6821   const unsigned ExpBiasf64 = 1023;
6822   const unsigned ExpBiasf16 = 15;
6823 
6824   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6825   Register U = Unmerge.getReg(0);
6826   Register UH = Unmerge.getReg(1);
6827 
6828   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6829   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6830 
6831   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6832   // add the f16 bias (15) to get the biased exponent for the f16 format.
6833   E = MIRBuilder.buildAdd(
6834     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6835 
6836   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6837   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6838 
6839   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6840                                        MIRBuilder.buildConstant(S32, 0x1ff));
6841   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6842 
6843   auto Zero = MIRBuilder.buildConstant(S32, 0);
6844   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6845   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6846   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6847 
6848   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6849   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6850   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6851   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6852 
6853   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6854   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6855 
6856   // N = M | (E << 12);
6857   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6858   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6859 
6860   // B = clamp(1-E, 0, 13);
6861   auto One = MIRBuilder.buildConstant(S32, 1);
6862   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6863   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6864   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6865 
6866   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6867                                        MIRBuilder.buildConstant(S32, 0x1000));
6868 
6869   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6870   auto D0 = MIRBuilder.buildShl(S32, D, B);
6871 
  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                               D0, SigSetHigh);
6874   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6875   D = MIRBuilder.buildOr(S32, D, D1);
6876 
6877   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6878   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6879 
6880   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6881   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6882 
6883   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6884                                        MIRBuilder.buildConstant(S32, 3));
6885   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6886 
6887   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6888                                        MIRBuilder.buildConstant(S32, 5));
6889   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6890 
6891   V1 = MIRBuilder.buildOr(S32, V0, V1);
6892   V = MIRBuilder.buildAdd(S32, V, V1);
6893 
  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
6896   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6897                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6898 
  // E == 1039 is the rebiased all-ones f64 exponent (2047 - 1023 + 15),
  // i.e. the source was Inf or NaN, so use the Inf/NaN bits computed above.
  auto CmpEEq1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEEq1039, I, V);
6902 
6903   // Extract the sign bit.
6904   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6905   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6906 
6907   // Insert the sign bit
6908   V = MIRBuilder.buildOr(S32, Sign, V);
6909 
6910   MIRBuilder.buildTrunc(Dst, V);
6911   MI.eraseFromParent();
6912   return Legalized;
6913 }
6914 
6915 LegalizerHelper::LegalizeResult
6916 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6917   auto [DstTy, SrcTy] = MI.getFirst2LLTs();
6918   const LLT S64 = LLT::scalar(64);
6919   const LLT S16 = LLT::scalar(16);
6920 
6921   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6922     return lowerFPTRUNC_F64_TO_F16(MI);
6923 
6924   return UnableToLegalize;
6925 }
6926 
// TODO: If RHS is a constant, SelectionDAGBuilder expands this into a
// multiplication tree.
6929 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6930   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6931   LLT Ty = MRI.getType(Dst);
6932 
6933   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6934   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6935   MI.eraseFromParent();
6936   return Legalized;
6937 }
6938 
6939 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6940   switch (Opc) {
6941   case TargetOpcode::G_SMIN:
6942     return CmpInst::ICMP_SLT;
6943   case TargetOpcode::G_SMAX:
6944     return CmpInst::ICMP_SGT;
6945   case TargetOpcode::G_UMIN:
6946     return CmpInst::ICMP_ULT;
6947   case TargetOpcode::G_UMAX:
6948     return CmpInst::ICMP_UGT;
6949   default:
    llvm_unreachable("not an integer min/max opcode");
6951   }
6952 }
6953 
6954 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6955   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6956 
6957   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6958   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6959 
6960   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6961   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6962 
6963   MI.eraseFromParent();
6964   return Legalized;
6965 }
6966 
6967 LegalizerHelper::LegalizeResult
6968 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6969   auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
6970   const int Src0Size = Src0Ty.getScalarSizeInBits();
6971   const int Src1Size = Src1Ty.getScalarSizeInBits();
6972 
6973   auto SignBitMask = MIRBuilder.buildConstant(
6974     Src0Ty, APInt::getSignMask(Src0Size));
6975 
6976   auto NotSignBitMask = MIRBuilder.buildConstant(
6977     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
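  // For f32 operands these masks are 0x80000000 and 0x7fffffff respectively.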
6978 
6979   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6980   Register And1;
6981   if (Src0Ty == Src1Ty) {
6982     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6983   } else if (Src0Size > Src1Size) {
6984     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6985     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6986     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6987     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6988   } else {
6989     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6990     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6991     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6992     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6993   }
6994 
6995   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6996   // constants are a nan and -0.0, but the final result should preserve
6997   // everything.
6998   unsigned Flags = MI.getFlags();
6999   MIRBuilder.buildOr(Dst, And0, And1, Flags);
7000 
7001   MI.eraseFromParent();
7002   return Legalized;
7003 }
7004 
7005 LegalizerHelper::LegalizeResult
7006 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
7007   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
7008     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
7009 
7010   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
7011   LLT Ty = MRI.getType(Dst);
7012 
7013   if (!MI.getFlag(MachineInstr::FmNoNans)) {
7014     // Insert canonicalizes if it's possible we need to quiet to get correct
7015     // sNaN behavior.
7016 
    // Note this must be done here, and not as an optimization combine in the
    // absence of a dedicated quiet-snan instruction, as we're using an
    // omni-purpose G_FCANONICALIZE.
7020     if (!isKnownNeverSNaN(Src0, MRI))
7021       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
7022 
7023     if (!isKnownNeverSNaN(Src1, MRI))
7024       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
7025   }
7026 
7027   // If there are no nans, it's safe to simply replace this with the non-IEEE
7028   // version.
7029   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
7030   MI.eraseFromParent();
7031   return Legalized;
7032 }
7033 
7034 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
7035   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
7036   Register DstReg = MI.getOperand(0).getReg();
7037   LLT Ty = MRI.getType(DstReg);
7038   unsigned Flags = MI.getFlags();
7039 
7040   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
7041                                   Flags);
7042   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
7043   MI.eraseFromParent();
7044   return Legalized;
7045 }
7046 
7047 LegalizerHelper::LegalizeResult
7048 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
7049   auto [DstReg, X] = MI.getFirst2Regs();
7050   const unsigned Flags = MI.getFlags();
7051   const LLT Ty = MRI.getType(DstReg);
7052   const LLT CondTy = Ty.changeElementSize(1);
7053 
7054   // round(x) =>
7055   //  t = trunc(x);
7056   //  d = fabs(x - t);
7057   //  o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
7058   //  return t + o;
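  //
  // e.g. round(2.5): t = 2.0, d = 0.5, o = copysign(1.0, 2.5) = 1.0, giving
  // 3.0; round(-2.5): t = -2.0, d = 0.5, o = -1.0, giving -3.0. Ties round
  // away from zero.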
7059 
7060   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
7061 
7062   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
7063   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
7064 
7065   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
7066   auto Cmp =
7067       MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags);
7068 
7069   // Could emit G_UITOFP instead
7070   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
7071   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
7072   auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero);
7073   auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X);
7074 
7075   MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags);
7076 
7077   MI.eraseFromParent();
7078   return Legalized;
7079 }
7080 
7081 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
7082   auto [DstReg, SrcReg] = MI.getFirst2Regs();
7083   unsigned Flags = MI.getFlags();
7084   LLT Ty = MRI.getType(DstReg);
7085   const LLT CondTy = Ty.changeElementSize(1);
7086 
7087   // result = trunc(src);
7088   // if (src < 0.0 && src != result)
7089   //   result += -1.0.
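  //
  // e.g. floor(-1.5): trunc gives -1.0; src is negative and differs from the
  // truncated value, so add -1.0 to get -2.0. For positive or integral inputs
  // nothing is added.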
7090 
7091   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
7092   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
7093 
7094   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
7095                                   SrcReg, Zero, Flags);
7096   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
7097                                       SrcReg, Trunc, Flags);
7098   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
7099   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
7100 
7101   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
7102   MI.eraseFromParent();
7103   return Legalized;
7104 }
7105 
7106 LegalizerHelper::LegalizeResult
7107 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
7108   const unsigned NumOps = MI.getNumOperands();
7109   auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
7110   unsigned PartSize = Src0Ty.getSizeInBits();
7111 
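  // The merge is expanded as shift-and-or; e.g. merging four s8 parts into an
  // s32 produces:
  //   res = zext(p0) | (zext(p1) << 8) | (zext(p2) << 16) | (zext(p3) << 24)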
7112   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
7113   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
7114 
7115   for (unsigned I = 2; I != NumOps; ++I) {
7116     const unsigned Offset = (I - 1) * PartSize;
7117 
7118     Register SrcReg = MI.getOperand(I).getReg();
7119     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
7120 
7121     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
7122       MRI.createGenericVirtualRegister(WideTy);
7123 
7124     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
7125     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
7126     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
7127     ResultReg = NextResult;
7128   }
7129 
7130   if (DstTy.isPointer()) {
7131     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
7132           DstTy.getAddressSpace())) {
7133       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
7134       return UnableToLegalize;
7135     }
7136 
7137     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
7138   }
7139 
7140   MI.eraseFromParent();
7141   return Legalized;
7142 }
7143 
7144 LegalizerHelper::LegalizeResult
7145 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
7146   const unsigned NumDst = MI.getNumOperands() - 1;
7147   Register SrcReg = MI.getOperand(NumDst).getReg();
7148   Register Dst0Reg = MI.getOperand(0).getReg();
7149   LLT DstTy = MRI.getType(Dst0Reg);
7150   if (DstTy.isPointer())
7151     return UnableToLegalize; // TODO
7152 
7153   SrcReg = coerceToScalar(SrcReg);
7154   if (!SrcReg)
7155     return UnableToLegalize;
7156 
7157   // Expand scalarizing unmerge as bitcast to integer and shift.
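  // e.g. unmerging an s32 into four s8 pieces produces:
  //   d0 = trunc(src), d1 = trunc(src >> 8), d2 = trunc(src >> 16),
  //   d3 = trunc(src >> 24)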
7158   LLT IntTy = MRI.getType(SrcReg);
7159 
7160   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
7161 
7162   const unsigned DstSize = DstTy.getSizeInBits();
7163   unsigned Offset = DstSize;
7164   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
7165     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
7166     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
7167     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
7168   }
7169 
7170   MI.eraseFromParent();
7171   return Legalized;
7172 }
7173 
7174 /// Lower a vector extract or insert by writing the vector to a stack temporary
7175 /// and reloading the element or vector.
7176 ///
7177 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
7178 ///  =>
7179 ///  %stack_temp = G_FRAME_INDEX
7180 ///  G_STORE %vec, %stack_temp
7181 ///  %idx = clamp(%idx, %vec.getNumElements())
7182 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
7183 ///  %dst = G_LOAD %element_ptr
7184 LegalizerHelper::LegalizeResult
7185 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
7186   Register DstReg = MI.getOperand(0).getReg();
7187   Register SrcVec = MI.getOperand(1).getReg();
7188   Register InsertVal;
7189   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
7190     InsertVal = MI.getOperand(2).getReg();
7191 
7192   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
7193 
7194   LLT VecTy = MRI.getType(SrcVec);
7195   LLT EltTy = VecTy.getElementType();
7196   unsigned NumElts = VecTy.getNumElements();
7197 
  int64_t IdxVal;
  // A constant index must be strictly in bounds; SrcRegs below only has
  // NumElts entries. Out-of-range constant indices fall through to the stack
  // lowering, which clamps.
  if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal >= 0 && IdxVal < NumElts) {
7200     SmallVector<Register, 8> SrcRegs;
7201     extractParts(SrcVec, EltTy, NumElts, SrcRegs);
7202 
7203     if (InsertVal) {
7204       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
7205       MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs);
7206     } else {
7207       MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
7208     }
7209 
7210     MI.eraseFromParent();
7211     return Legalized;
7212   }
7213 
7214   if (!EltTy.isByteSized()) { // Not implemented.
7215     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
7216     return UnableToLegalize;
7217   }
7218 
7219   unsigned EltBytes = EltTy.getSizeInBytes();
7220   Align VecAlign = getStackTemporaryAlignment(VecTy);
7221   Align EltAlign;
7222 
7223   MachinePointerInfo PtrInfo;
7224   auto StackTemp = createStackTemporary(
7225       TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, PtrInfo);
7226   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
7227 
7228   // Get the pointer to the element, and be sure not to hit undefined behavior
7229   // if the index is out of bounds.
7230   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
7231 
7232   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
7233     int64_t Offset = IdxVal * EltBytes;
7234     PtrInfo = PtrInfo.getWithOffset(Offset);
7235     EltAlign = commonAlignment(VecAlign, Offset);
7236   } else {
7237     // We lose information with a variable offset.
7238     EltAlign = getStackTemporaryAlignment(EltTy);
7239     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
7240   }
7241 
7242   if (InsertVal) {
7243     // Write the inserted element
7244     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
7245 
7246     // Reload the whole vector.
7247     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
7248   } else {
7249     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
7250   }
7251 
7252   MI.eraseFromParent();
7253   return Legalized;
7254 }
7255 
7256 LegalizerHelper::LegalizeResult
7257 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
7258   auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
7259       MI.getFirst3RegLLTs();
7260   LLT IdxTy = LLT::scalar(32);
7261 
7262   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
7263   Register Undef;
7264   SmallVector<Register, 32> BuildVec;
7265   LLT EltTy = DstTy.getScalarType();
7266 
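  // Each mask element selects a lane from one of the two sources; e.g. with
  // <2 x s32> sources, mask element 3 extracts element 1 of Src1, and a
  // negative mask element produces an undef lane.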
7267   for (int Idx : Mask) {
7268     if (Idx < 0) {
7269       if (!Undef.isValid())
7270         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
7271       BuildVec.push_back(Undef);
7272       continue;
7273     }
7274 
7275     if (Src0Ty.isScalar()) {
7276       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
7277     } else {
7278       int NumElts = Src0Ty.getNumElements();
7279       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
7280       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
7281       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
7282       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
7283       BuildVec.push_back(Extract.getReg(0));
7284     }
7285   }
7286 
7287   if (DstTy.isScalar())
7288     MIRBuilder.buildCopy(DstReg, BuildVec[0]);
7289   else
7290     MIRBuilder.buildBuildVector(DstReg, BuildVec);
7291   MI.eraseFromParent();
7292   return Legalized;
7293 }
7294 
7295 Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
7296                                                     Register AllocSize,
7297                                                     Align Alignment,
7298                                                     LLT PtrTy) {
7299   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
7300 
7301   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
7302   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
7303 
7304   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
7305   // have to generate an extra instruction to negate the alloc and then use
7306   // G_PTR_ADD to add the negative offset.
7307   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
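  // Round the result down to the requested alignment; e.g. for a 16-byte
  // alignment the mask is ~15, which clears the low 4 bits. The stack grows
  // down, so rounding down stays within the allocation.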
7308   if (Alignment > Align(1)) {
7309     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
7310     AlignMask.negate();
7311     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
7312     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
7313   }
7314 
7315   return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
7316 }
7317 
7318 LegalizerHelper::LegalizeResult
7319 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
7320   const auto &MF = *MI.getMF();
7321   const auto &TFI = *MF.getSubtarget().getFrameLowering();
7322   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
7323     return UnableToLegalize;
7324 
7325   Register Dst = MI.getOperand(0).getReg();
7326   Register AllocSize = MI.getOperand(1).getReg();
7327   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
7328 
7329   LLT PtrTy = MRI.getType(Dst);
7330   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
7331   Register SPTmp =
7332       getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
7333 
7334   MIRBuilder.buildCopy(SPReg, SPTmp);
7335   MIRBuilder.buildCopy(Dst, SPTmp);
7336 
7337   MI.eraseFromParent();
7338   return Legalized;
7339 }
7340 
7341 LegalizerHelper::LegalizeResult
7342 LegalizerHelper::lowerStackSave(MachineInstr &MI) {
7343   Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
7344   if (!StackPtr)
7345     return UnableToLegalize;
7346 
7347   MIRBuilder.buildCopy(MI.getOperand(0), StackPtr);
7348   MI.eraseFromParent();
7349   return Legalized;
7350 }
7351 
7352 LegalizerHelper::LegalizeResult
7353 LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
7354   Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
7355   if (!StackPtr)
7356     return UnableToLegalize;
7357 
7358   MIRBuilder.buildCopy(StackPtr, MI.getOperand(0));
7359   MI.eraseFromParent();
7360   return Legalized;
7361 }
7362 
7363 LegalizerHelper::LegalizeResult
7364 LegalizerHelper::lowerExtract(MachineInstr &MI) {
7365   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7366   unsigned Offset = MI.getOperand(2).getImm();
7367 
7368   // Extract sub-vector or one element
7369   if (SrcTy.isVector()) {
7370     unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
7371     unsigned DstSize = DstTy.getSizeInBits();
7372 
7373     if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
7374         (Offset + DstSize <= SrcTy.getSizeInBits())) {
7375       // Unmerge and allow access to each Src element for the artifact combiner.
7376       auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg);
7377 
7378       // Take element(s) we need to extract and copy it (merge them).
7379       SmallVector<Register, 8> SubVectorElts;
7380       for (unsigned Idx = Offset / SrcEltSize;
7381            Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
7382         SubVectorElts.push_back(Unmerge.getReg(Idx));
7383       }
7384       if (SubVectorElts.size() == 1)
7385         MIRBuilder.buildCopy(DstReg, SubVectorElts[0]);
7386       else
7387         MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts);
7388 
7389       MI.eraseFromParent();
7390       return Legalized;
7391     }
7392   }
7393 
7394   if (DstTy.isScalar() &&
7395       (SrcTy.isScalar() ||
7396        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
7397     LLT SrcIntTy = SrcTy;
7398     if (!SrcTy.isScalar()) {
7399       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
7400       SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
7401     }
7402 
7403     if (Offset == 0)
7404       MIRBuilder.buildTrunc(DstReg, SrcReg);
7405     else {
7406       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
7407       auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt);
7408       MIRBuilder.buildTrunc(DstReg, Shr);
7409     }
7410 
7411     MI.eraseFromParent();
7412     return Legalized;
7413   }
7414 
7415   return UnableToLegalize;
7416 }
7417 
7418 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
7419   auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
7420   uint64_t Offset = MI.getOperand(3).getImm();
7421 
7422   LLT DstTy = MRI.getType(Src);
7423   LLT InsertTy = MRI.getType(InsertSrc);
7424 
7425   // Insert sub-vector or one element
7426   if (DstTy.isVector() && !InsertTy.isPointer()) {
7427     LLT EltTy = DstTy.getElementType();
7428     unsigned EltSize = EltTy.getSizeInBits();
7429     unsigned InsertSize = InsertTy.getSizeInBits();
7430 
7431     if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
7432         (Offset + InsertSize <= DstTy.getSizeInBits())) {
7433       auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
7434       SmallVector<Register, 8> DstElts;
7435       unsigned Idx = 0;
7436       // Elements from Src before insert start Offset
7437       for (; Idx < Offset / EltSize; ++Idx) {
7438         DstElts.push_back(UnmergeSrc.getReg(Idx));
7439       }
7440 
7441       // Replace elements in Src with elements from InsertSrc
7442       if (InsertTy.getSizeInBits() > EltSize) {
7443         auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
7444         for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
7445              ++Idx, ++i) {
7446           DstElts.push_back(UnmergeInsertSrc.getReg(i));
7447         }
7448       } else {
7449         DstElts.push_back(InsertSrc);
7450         ++Idx;
7451       }
7452 
7453       // Remaining elements from Src after insert
7454       for (; Idx < DstTy.getNumElements(); ++Idx) {
7455         DstElts.push_back(UnmergeSrc.getReg(Idx));
7456       }
7457 
7458       MIRBuilder.buildMergeLikeInstr(Dst, DstElts);
7459       MI.eraseFromParent();
7460       return Legalized;
7461     }
7462   }
7463 
7464   if (InsertTy.isVector() ||
7465       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
7466     return UnableToLegalize;
7467 
7468   const DataLayout &DL = MIRBuilder.getDataLayout();
7469   if ((DstTy.isPointer() &&
7470        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
7471       (InsertTy.isPointer() &&
7472        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
7473     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
7474     return UnableToLegalize;
7475   }
7476 
7477   LLT IntDstTy = DstTy;
7478 
7479   if (!DstTy.isScalar()) {
7480     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
7481     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
7482   }
7483 
7484   if (!InsertTy.isScalar()) {
7485     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
7486     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
7487   }
7488 
7489   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
7490   if (Offset != 0) {
7491     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
7492     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
7493   }
7494 
7495   APInt MaskVal = APInt::getBitsSetWithWrap(
7496       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
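  // The wrapped mask covers everything outside the insert window; e.g.
  // inserting an s8 at offset 8 into an s32 gives MaskVal = 0xffff00ff, so
  // the old byte is cleared before OR-ing in the shifted insert value.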
7497 
7498   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
7499   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
7500   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
7501 
7502   MIRBuilder.buildCast(Dst, Or);
7503   MI.eraseFromParent();
7504   return Legalized;
7505 }
7506 
7507 LegalizerHelper::LegalizeResult
7508 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
7509   auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
7510       MI.getFirst4RegLLTs();
7511   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
7512 
7513   LLT Ty = Dst0Ty;
7514   LLT BoolTy = Dst1Ty;
7515 
7516   if (IsAdd)
7517     MIRBuilder.buildAdd(Dst0, LHS, RHS);
7518   else
7519     MIRBuilder.buildSub(Dst0, LHS, RHS);
7520 
7521   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
7522 
7523   auto Zero = MIRBuilder.buildConstant(Ty, 0);
7524 
7525   // For an addition, the result should be less than one of the operands (LHS)
7526   // if and only if the other operand (RHS) is negative, otherwise there will
7527   // be overflow.
7528   // For a subtraction, the result should be less than one of the operands
7529   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
7530   // otherwise there will be overflow.
7531   auto ResultLowerThanLHS =
7532       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
7533   auto ConditionRHS = MIRBuilder.buildICmp(
7534       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
7535 
7536   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
7537   MI.eraseFromParent();
7538   return Legalized;
7539 }
7540 
7541 LegalizerHelper::LegalizeResult
7542 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
7543   auto [Res, LHS, RHS] = MI.getFirst3Regs();
7544   LLT Ty = MRI.getType(Res);
7545   bool IsSigned;
7546   bool IsAdd;
7547   unsigned BaseOp;
7548   switch (MI.getOpcode()) {
7549   default:
7550     llvm_unreachable("unexpected addsat/subsat opcode");
7551   case TargetOpcode::G_UADDSAT:
7552     IsSigned = false;
7553     IsAdd = true;
7554     BaseOp = TargetOpcode::G_ADD;
7555     break;
7556   case TargetOpcode::G_SADDSAT:
7557     IsSigned = true;
7558     IsAdd = true;
7559     BaseOp = TargetOpcode::G_ADD;
7560     break;
7561   case TargetOpcode::G_USUBSAT:
7562     IsSigned = false;
7563     IsAdd = false;
7564     BaseOp = TargetOpcode::G_SUB;
7565     break;
7566   case TargetOpcode::G_SSUBSAT:
7567     IsSigned = true;
7568     IsAdd = false;
7569     BaseOp = TargetOpcode::G_SUB;
7570     break;
7571   }
7572 
7573   if (IsSigned) {
7574     // sadd.sat(a, b) ->
7575     //   hi = 0x7fffffff - smax(a, 0)
7576     //   lo = 0x80000000 - smin(a, 0)
7577     //   a + smin(smax(lo, b), hi)
7578     // ssub.sat(a, b) ->
7579     //   lo = smax(a, -1) - 0x7fffffff
7580     //   hi = smin(a, -1) - 0x80000000
7581     //   a - smin(smax(lo, b), hi)
7582     // TODO: AMDGPU can use a "median of 3" instruction here:
7583     //   a +/- med3(lo, b, hi)
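    // e.g. for s8, sadd.sat(100, 100): hi = 127 - smax(100, 0) = 27 and
    // lo = -128 - smin(100, 0) = -128, so b is clamped to 27 and the sum
    // saturates to 100 + 27 = 127.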
7584     uint64_t NumBits = Ty.getScalarSizeInBits();
7585     auto MaxVal =
7586         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
7587     auto MinVal =
7588         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7589     MachineInstrBuilder Hi, Lo;
7590     if (IsAdd) {
7591       auto Zero = MIRBuilder.buildConstant(Ty, 0);
7592       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
7593       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
7594     } else {
7595       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
7596       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
7597                                MaxVal);
7598       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
7599                                MinVal);
7600     }
7601     auto RHSClamped =
7602         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
7603     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
7604   } else {
7605     // uadd.sat(a, b) -> a + umin(~a, b)
7606     // usub.sat(a, b) -> a - umin(a, b)
7607     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
7608     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
7609     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
7610   }
7611 
7612   MI.eraseFromParent();
7613   return Legalized;
7614 }
7615 
7616 LegalizerHelper::LegalizeResult
7617 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7618   auto [Res, LHS, RHS] = MI.getFirst3Regs();
7619   LLT Ty = MRI.getType(Res);
7620   LLT BoolTy = Ty.changeElementSize(1);
7621   bool IsSigned;
7622   bool IsAdd;
7623   unsigned OverflowOp;
7624   switch (MI.getOpcode()) {
7625   default:
7626     llvm_unreachable("unexpected addsat/subsat opcode");
7627   case TargetOpcode::G_UADDSAT:
7628     IsSigned = false;
7629     IsAdd = true;
7630     OverflowOp = TargetOpcode::G_UADDO;
7631     break;
7632   case TargetOpcode::G_SADDSAT:
7633     IsSigned = true;
7634     IsAdd = true;
7635     OverflowOp = TargetOpcode::G_SADDO;
7636     break;
7637   case TargetOpcode::G_USUBSAT:
7638     IsSigned = false;
7639     IsAdd = false;
7640     OverflowOp = TargetOpcode::G_USUBO;
7641     break;
7642   case TargetOpcode::G_SSUBSAT:
7643     IsSigned = true;
7644     IsAdd = false;
7645     OverflowOp = TargetOpcode::G_SSUBO;
7646     break;
7647   }
7648 
7649   auto OverflowRes =
7650       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7651   Register Tmp = OverflowRes.getReg(0);
7652   Register Ov = OverflowRes.getReg(1);
7653   MachineInstrBuilder Clamp;
7654   if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : tmp
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : tmp
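    // e.g. for s32, sadd.sat(INT_MAX, 1): tmp wraps to INT_MIN and ov is
    // set; clamp = (tmp >>s 31) + INT_MIN = -1 + INT_MIN, which wraps to
    // INT_MAX, the saturated result.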
7661     uint64_t NumBits = Ty.getScalarSizeInBits();
7662     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7663     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7664     auto MinVal =
7665         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7666     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7667   } else {
7668     // uadd.sat(a, b) ->
7669     //   {tmp, ov} = uaddo(a, b)
7670     //   ov ? 0xffffffff : tmp
7671     // usub.sat(a, b) ->
7672     //   {tmp, ov} = usubo(a, b)
7673     //   ov ? 0 : tmp
7674     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7675   }
7676   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7677 
7678   MI.eraseFromParent();
7679   return Legalized;
7680 }
7681 
7682 LegalizerHelper::LegalizeResult
7683 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7684   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7685           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7686          "Expected shlsat opcode!");
7687   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7688   auto [Res, LHS, RHS] = MI.getFirst3Regs();
7689   LLT Ty = MRI.getType(Res);
7690   LLT BoolTy = Ty.changeElementSize(1);
7691 
7692   unsigned BW = Ty.getScalarSizeInBits();
7693   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7694   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7695                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7696 
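  // Shifting back recovers the operand only if no bits were lost; e.g. for
  // s8, ushl.sat(0x40, 2) wraps to 0x00, the shift back gives 0 != 0x40, and
  // the result saturates to 0xff.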
7697   MachineInstrBuilder SatVal;
7698   if (IsSigned) {
7699     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7700     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7701     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7702                                     MIRBuilder.buildConstant(Ty, 0));
7703     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7704   } else {
7705     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7706   }
7707   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7708   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7709 
7710   MI.eraseFromParent();
7711   return Legalized;
7712 }
7713 
7714 LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
7715   auto [Dst, Src] = MI.getFirst2Regs();
7716   const LLT Ty = MRI.getType(Src);
7717   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7718   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
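  // e.g. for s32, BaseShiftAmt is 24 and bswap(0xaabbccdd) = 0xddccbbaa.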
7719 
7720   // Swap most and least significant byte, set remaining bytes in Res to zero.
7721   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7722   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7723   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7724   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7725 
7726   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7727   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
    // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
    // Use an unsigned 64-bit shift so the mask stays well-defined for i >= 3.
    APInt APMask(SizeInBytes * 8, 0xFFULL << (i * 8));
7730     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7731     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7732     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7733     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7734     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7735     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7736     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7737     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7738     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7739     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7740   }
  // Rewrite the def of the final OR so it writes directly to Dst.
  Res.getInstr()->getOperand(0).setReg(Dst);
7742 
7743   MI.eraseFromParent();
7744   return Legalized;
7745 }
7746 
// { (Src & Mask) >> N } | { (Src << N) & Mask }
7748 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7749                                  MachineInstrBuilder Src, APInt Mask) {
7750   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7751   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7752   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7753   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7754   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7755   return B.buildOr(Dst, LHS, RHS);
7756 }
7757 
7758 LegalizerHelper::LegalizeResult
7759 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7760   auto [Dst, Src] = MI.getFirst2Regs();
7761   const LLT Ty = MRI.getType(Src);
7762   unsigned Size = Ty.getSizeInBits();
7763 
7764   MachineInstrBuilder BSWAP =
7765       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7766 
7767   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7768   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7769   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7770   MachineInstrBuilder Swap4 =
7771       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7772 
  // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
  //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
  // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
7776   MachineInstrBuilder Swap2 =
7777       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7778 
  // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
  //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
  // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
7782   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7783 
7784   MI.eraseFromParent();
7785   return Legalized;
7786 }
7787 
7788 LegalizerHelper::LegalizeResult
7789 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7790   MachineFunction &MF = MIRBuilder.getMF();
7791 
7792   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7793   int NameOpIdx = IsRead ? 1 : 0;
7794   int ValRegIndex = IsRead ? 0 : 1;
7795 
7796   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7797   const LLT Ty = MRI.getType(ValReg);
7798   const MDString *RegStr = cast<MDString>(
7799     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7800 
7801   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7802   if (!PhysReg.isValid())
7803     return UnableToLegalize;
7804 
7805   if (IsRead)
7806     MIRBuilder.buildCopy(ValReg, PhysReg);
7807   else
7808     MIRBuilder.buildCopy(PhysReg, ValReg);
7809 
7810   MI.eraseFromParent();
7811   return Legalized;
7812 }
7813 
7814 LegalizerHelper::LegalizeResult
7815 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7816   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7817   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7818   Register Result = MI.getOperand(0).getReg();
7819   LLT OrigTy = MRI.getType(Result);
7820   auto SizeInBits = OrigTy.getScalarSizeInBits();
7821   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7822 
7823   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7824   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7825   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7826   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7827 
7828   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7829   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7830   MIRBuilder.buildTrunc(Result, Shifted);
7831 
7832   MI.eraseFromParent();
7833   return Legalized;
7834 }
7835 
7836 LegalizerHelper::LegalizeResult
7837 LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
7838   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7839   FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());
7840 
7841   if (Mask == fcNone) {
7842     MIRBuilder.buildConstant(DstReg, 0);
7843     MI.eraseFromParent();
7844     return Legalized;
7845   }
7846   if (Mask == fcAllFlags) {
7847     MIRBuilder.buildConstant(DstReg, 1);
7848     MI.eraseFromParent();
7849     return Legalized;
7850   }
7851 
7852   // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
7853   // version
7854 
7855   unsigned BitSize = SrcTy.getScalarSizeInBits();
7856   const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
7857 
7858   LLT IntTy = LLT::scalar(BitSize);
7859   if (SrcTy.isVector())
7860     IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
7861   auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);
7862 
7863   // Various masks.
7864   APInt SignBit = APInt::getSignMask(BitSize);
7865   APInt ValueMask = APInt::getSignedMaxValue(BitSize);     // All bits but sign.
7866   APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
7867   APInt ExpMask = Inf;
7868   APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
7869   APInt QNaNBitMask =
7870       APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
  APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());
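  // e.g. for f32: SignBit = 0x80000000, ValueMask = 0x7fffffff,
  // Inf = ExpMask = 0x7f800000, QNaNBitMask = 0x00400000.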
7872 
7873   auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit);
7874   auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask);
7875   auto InfC = MIRBuilder.buildConstant(IntTy, Inf);
7876   auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask);
7877   auto ZeroC = MIRBuilder.buildConstant(IntTy, 0);
7878 
7879   auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC);
7880   auto Sign =
7881       MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs);
7882 
7883   auto Res = MIRBuilder.buildConstant(DstTy, 0);
7884   // Clang doesn't support capture of structured bindings:
7885   LLT DstTyCopy = DstTy;
7886   const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
7887     Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend);
7888   };
7889 
7890   // Tests that involve more than one class should be processed first.
7891   if ((Mask & fcFinite) == fcFinite) {
7892     // finite(V) ==> abs(V) u< exp_mask
7893     appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
7894                                      ExpMaskC));
7895     Mask &= ~fcFinite;
7896   } else if ((Mask & fcFinite) == fcPosFinite) {
7897     // finite(V) && V > 0 ==> V u< exp_mask
7898     appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
7899                                      ExpMaskC));
7900     Mask &= ~fcPosFinite;
7901   } else if ((Mask & fcFinite) == fcNegFinite) {
7902     // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
7903     auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
7904                                     ExpMaskC);
7905     auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign);
7906     appendToRes(And);
7907     Mask &= ~fcNegFinite;
7908   }
7909 
7910   if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
7911     // fcZero | fcSubnormal => test all exponent bits are 0
7912     // TODO: Handle sign bit specific cases
7913     // TODO: Handle inverted case
7914     if (PartialCheck == (fcZero | fcSubnormal)) {
7915       auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC);
7916       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7917                                        ExpBits, ZeroC));
7918       Mask &= ~PartialCheck;
7919     }
7920   }
7921 
7922   // Check for individual classes.
7923   if (FPClassTest PartialCheck = Mask & fcZero) {
7924     if (PartialCheck == fcPosZero)
7925       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7926                                        AsInt, ZeroC));
7927     else if (PartialCheck == fcZero)
7928       appendToRes(
7929           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
7930     else // fcNegZero
7931       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7932                                        AsInt, SignBitC));
7933   }
7934 
7935   if (FPClassTest PartialCheck = Mask & fcSubnormal) {
7936     // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
7937     // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
7938     auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
7939     auto OneC = MIRBuilder.buildConstant(IntTy, 1);
7940     auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
7941     auto SubnormalRes =
7942         MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
7943                              MIRBuilder.buildConstant(IntTy, AllOneMantissa));
7944     if (PartialCheck == fcNegSubnormal)
7945       SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
7946     appendToRes(SubnormalRes);
7947   }
7948 
7949   if (FPClassTest PartialCheck = Mask & fcInf) {
7950     if (PartialCheck == fcPosInf)
7951       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7952                                        AsInt, InfC));
7953     else if (PartialCheck == fcInf)
7954       appendToRes(
7955           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
7956     else { // fcNegInf
7957       APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
7958       auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf);
7959       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7960                                        AsInt, NegInfC));
7961     }
7962   }
7963 
7964   if (FPClassTest PartialCheck = Mask & fcNan) {
7965     auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask);
7966     if (PartialCheck == fcNan) {
7967       // isnan(V) ==> abs(V) u> int(inf)
7968       appendToRes(
7969           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
7970     } else if (PartialCheck == fcQNan) {
7971       // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
7972       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
7973                                        InfWithQnanBitC));
7974     } else { // fcSNan
7975       // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
7976       //                    abs(V) u< (unsigned(Inf) | quiet_bit)
7977       auto IsNan =
7978           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC);
7979       auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy,
7980                                             Abs, InfWithQnanBitC);
7981       appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
7982     }
7983   }
7984 
7985   if (FPClassTest PartialCheck = Mask & fcNormal) {
7986     // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
7987     // (max_exp-1))
7988     APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
7989     auto ExpMinusOne = MIRBuilder.buildSub(
7990         IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB));
7991     APInt MaxExpMinusOne = ExpMask - ExpLSB;
7992     auto NormalRes =
7993         MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
7994                              MIRBuilder.buildConstant(IntTy, MaxExpMinusOne));
7995     if (PartialCheck == fcNegNormal)
7996       NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
7997     else if (PartialCheck == fcPosNormal) {
      auto PosSign = MIRBuilder.buildXor(
          DstTy, Sign, MIRBuilder.buildConstant(DstTy, InversionMask));
8000       NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
8001     }
8002     appendToRes(NormalRes);
8003   }
8004 
8005   MIRBuilder.buildCopy(DstReg, Res);
8006   MI.eraseFromParent();
8007   return Legalized;
8008 }
8009 
8010 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
8011   // Implement vector G_SELECT in terms of XOR, AND, OR.
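  // With a sign-extended all-ones/all-zeros lane mask this computes:
  //   res = (op1 & mask) | (op2 & ~mask)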
8012   auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
8013       MI.getFirst4RegLLTs();
8014   if (!DstTy.isVector())
8015     return UnableToLegalize;
8016 
8017   bool IsEltPtr = DstTy.getElementType().isPointer();
8018   if (IsEltPtr) {
8019     LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
8020     LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
8021     Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0);
8022     Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0);
8023     DstTy = NewTy;
8024   }
8025 
8026   if (MaskTy.isScalar()) {
8027     // Turn the scalar condition into a vector condition mask.
8028 
8029     Register MaskElt = MaskReg;
8030 
8031     // The condition was potentially zero extended before, but we want a sign
8032     // extended boolean.
8033     if (MaskTy != LLT::scalar(1))
8034       MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);
8035 
8036     // Continue the sign extension (or truncate) to match the data type.
8037     MaskElt = MIRBuilder.buildSExtOrTrunc(DstTy.getElementType(),
8038                                           MaskElt).getReg(0);
8039 
8040     // Generate a vector splat idiom.
8041     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
8042     MaskReg = ShufSplat.getReg(0);
8043     MaskTy = DstTy;
8044   }
8045 
8046   if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
8047     return UnableToLegalize;
8048   }
8049 
8050   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
8051   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
8052   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
8053   if (IsEltPtr) {
8054     auto Or = MIRBuilder.buildOr(DstTy, NewOp1, NewOp2);
8055     MIRBuilder.buildIntToPtr(DstReg, Or);
8056   } else {
8057     MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
8058   }
8059   MI.eraseFromParent();
8060   return Legalized;
8061 }
8062 
8063 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
8064   // Split DIVREM into individual instructions.
8065   unsigned Opcode = MI.getOpcode();
8066 
8067   MIRBuilder.buildInstr(
8068       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
8069                                         : TargetOpcode::G_UDIV,
8070       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
8071   MIRBuilder.buildInstr(
8072       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
8073                                         : TargetOpcode::G_UREM,
8074       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
8075   MI.eraseFromParent();
8076   return Legalized;
8077 }
8078 
8079 LegalizerHelper::LegalizeResult
8080 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
8081   // Expand %res = G_ABS %a into:
8082   // %v1 = G_ASHR %a, scalar_size-1
8083   // %v2 = G_ADD %a, %v1
8084   // %res = G_XOR %v2, %v1
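  // e.g. for a = -5: v1 = -1, v2 = -6, res = -6 ^ -1 = 5; for a >= 0,
  // v1 = 0 and the expansion is the identity.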
8085   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
8086   Register OpReg = MI.getOperand(1).getReg();
8087   auto ShiftAmt =
8088       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
8089   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
8090   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
8091   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
8092   MI.eraseFromParent();
8093   return Legalized;
8094 }
8095 
8096 LegalizerHelper::LegalizeResult
8097 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
8098   // Expand %res = G_ABS %a into:
8099   // %v1 = G_CONSTANT 0
8100   // %v2 = G_SUB %v1, %a
8101   // %res = G_SMAX %a, %v2
8102   Register SrcReg = MI.getOperand(1).getReg();
8103   LLT Ty = MRI.getType(SrcReg);
8104   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
8105   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
8106   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
8107   MI.eraseFromParent();
8108   return Legalized;
8109 }
8110 
8111 LegalizerHelper::LegalizeResult
8112 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
8113   Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
8116 
8117   // The source could be a scalar if the IR type was <1 x sN>.
8118   if (SrcTy.isScalar()) {
8119     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
8120       return UnableToLegalize; // FIXME: handle extension.
8121     // This can be just a plain copy.
8122     Observer.changingInstr(MI);
8123     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
8124     Observer.changedInstr(MI);
8125     return Legalized;
8126   }
8127   return UnableToLegalize;
8128 }
8129 
8130 static Type *getTypeForLLT(LLT Ty, LLVMContext &C);
8131 
8132 LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
8133   MachineFunction &MF = *MI.getMF();
8134   const DataLayout &DL = MIRBuilder.getDataLayout();
8135   LLVMContext &Ctx = MF.getFunction().getContext();
8136   Register ListPtr = MI.getOperand(1).getReg();
8137   LLT PtrTy = MRI.getType(ListPtr);
8138 
  // ListPtr is a pointer to the head of the list. Load the address of the
  // current head of the list from it.
8141   Align PtrAlignment = DL.getABITypeAlign(getTypeForLLT(PtrTy, Ctx));
8142   MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
8143       MachinePointerInfo(), MachineMemOperand::MOLoad, PtrTy, PtrAlignment);
8144   auto VAList = MIRBuilder.buildLoad(PtrTy, ListPtr, *PtrLoadMMO).getReg(0);
8145 
8146   const Align A(MI.getOperand(2).getImm());
8147   LLT PtrTyAsScalarTy = LLT::scalar(PtrTy.getSizeInBits());
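  // Realign the list pointer for over-aligned arguments; e.g. for a 16-byte
  // alignment this adds 15 and clears the low 4 bits, rounding VAList up to
  // the next 16-byte boundary.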
8148   if (A > TLI.getMinStackArgumentAlignment()) {
8149     Register AlignAmt =
8150         MIRBuilder.buildConstant(PtrTyAsScalarTy, A.value() - 1).getReg(0);
8151     auto AddDst = MIRBuilder.buildPtrAdd(PtrTy, VAList, AlignAmt);
8152     auto AndDst = MIRBuilder.buildMaskLowPtrBits(PtrTy, AddDst, Log2(A));
8153     VAList = AndDst.getReg(0);
8154   }
8155 
  // Increment the pointer, VAList, to the next vaarg.
  // The list is bumped by the allocation size of the type currently at the
  // head of the list.
8159   Register Dst = MI.getOperand(0).getReg();
8160   LLT LLTTy = MRI.getType(Dst);
8161   Type *Ty = getTypeForLLT(LLTTy, Ctx);
8162   auto IncAmt =
8163       MIRBuilder.buildConstant(PtrTyAsScalarTy, DL.getTypeAllocSize(Ty));
8164   auto Succ = MIRBuilder.buildPtrAdd(PtrTy, VAList, IncAmt);
8165 
  // Store the incremented VAList back through ListPtr.
8167   MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
8168       MachinePointerInfo(), MachineMemOperand::MOStore, PtrTy, PtrAlignment);
8169   MIRBuilder.buildStore(Succ, ListPtr, *StoreMMO);
8170   // Load the actual argument out of the pointer VAList
8171   Align EltAlignment = DL.getABITypeAlign(Ty);
8172   MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
8173       MachinePointerInfo(), MachineMemOperand::MOLoad, LLTTy, EltAlignment);
8174   MIRBuilder.buildLoad(Dst, VAList, *EltLoadMMO);
8175 
8176   MI.eraseFromParent();
8177   return Legalized;
8178 }
8179 
8180 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
8181   // On Darwin, -Os means optimize for size without hurting performance, so
8182   // only really optimize for size when -Oz (MinSize) is used.
8183   if (MF.getTarget().getTargetTriple().isOSDarwin())
8184     return MF.getFunction().hasMinSize();
8185   return MF.getFunction().hasOptSize();
8186 }
8187 
8188 // Returns a list of types to use for memory op lowering in MemOps. A partial
8189 // port of findOptimalMemOpLowering in TargetLowering.
8190 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
8191                                           unsigned Limit, const MemOp &Op,
8192                                           unsigned DstAS, unsigned SrcAS,
8193                                           const AttributeList &FuncAttributes,
8194                                           const TargetLowering &TLI) {
8195   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
8196     return false;
8197 
8198   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
8199 
8200   if (Ty == LLT()) {
8201     // Use the largest scalar type whose alignment constraints are satisfied.
8202     // We only need to check DstAlign here as SrcAlign is always greater or
8203     // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
        Ty = LLT::scalar(Ty.getSizeInBits() / 2); // Halve: s64, s32, s16, s8.
8209     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
8210     // FIXME: check for the largest legal type we can load/store to.
8211   }
8212 
8213   unsigned NumMemOps = 0;
8214   uint64_t Size = Op.size();
8215   while (Size) {
8216     unsigned TySize = Ty.getSizeInBytes();
8217     while (TySize > Size) {
      // For now, only use non-vector loads/stores for the left-over pieces.
8219       LLT NewTy = Ty;
8220       // FIXME: check for mem op safety and legality of the types. Not all of
8221       // SDAGisms map cleanly to GISel concepts.
8222       if (NewTy.isVector())
8223         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
8224       NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
8225       unsigned NewTySize = NewTy.getSizeInBytes();
8226       assert(NewTySize > 0 && "Could not find appropriate type");
8227 
8228       // If the new LLT cannot cover all of the remaining bits, then consider
8229       // issuing a (or a pair of) unaligned and overlapping load / store.
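      // e.g. a 7-byte copy can be done as two s32 operations over bytes
      // [0,3] and [3,6] when fast misaligned accesses are available.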
8230       unsigned Fast;
8231       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
8232       MVT VT = getMVTForLLT(Ty);
8233       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
8234           TLI.allowsMisalignedMemoryAccesses(
8235               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
8236               MachineMemOperand::MONone, &Fast) &&
8237           Fast)
8238         TySize = Size;
8239       else {
8240         Ty = NewTy;
8241         TySize = NewTySize;
8242       }
8243     }
8244 
8245     if (++NumMemOps > Limit)
8246       return false;
8247 
8248     MemOps.push_back(Ty);
8249     Size -= TySize;
8250   }
8251 
8252   return true;
8253 }
8254 
8255 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
8256   if (Ty.isVector())
8257     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
8258                                 Ty.getNumElements());
8259   return IntegerType::get(C, Ty.getSizeInBits());
8260 }
8261 
8262 // Get a vectorized representation of the memset value operand, GISel edition.
8263 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
8264   MachineRegisterInfo &MRI = *MIB.getMRI();
8265   unsigned NumBits = Ty.getScalarSizeInBits();
8266   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
8267   if (!Ty.isVector() && ValVRegAndVal) {
8268     APInt Scalar = ValVRegAndVal->Value.trunc(8);
8269     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
8270     return MIB.buildConstant(Ty, SplatVal).getReg(0);
8271   }
8272 
8273   // Extend the byte value to the larger type, and then multiply by a magic
8274   // value 0x010101... in order to replicate it across every byte.
8275   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
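       // For example (illustrative): with a 64-bit type and byte value 0xAB,
       //   0xAB * 0x0101010101010101 == 0xABABABABABABABAB.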
8276   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
8277     return MIB.buildConstant(Ty, 0).getReg(0);
8278   }
8279 
8280   LLT ExtType = Ty.getScalarType();
8281   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
8282   if (NumBits > 8) {
8283     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
8284     auto MagicMI = MIB.buildConstant(ExtType, Magic);
8285     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
8286   }
8287 
8288   // For vector types create a G_BUILD_VECTOR.
8289   if (Ty.isVector())
8290     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
8291 
8292   return Val;
8293 }
8294 
8295 LegalizerHelper::LegalizeResult
8296 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
8297                              uint64_t KnownLen, Align Alignment,
8298                              bool IsVolatile) {
8299   auto &MF = *MI.getParent()->getParent();
8300   const auto &TLI = *MF.getSubtarget().getTargetLowering();
8301   auto &DL = MF.getDataLayout();
8302   LLVMContext &C = MF.getFunction().getContext();
8303 
8304   assert(KnownLen != 0 && "Have a zero-length memset!");
8305 
8306   bool DstAlignCanChange = false;
8307   MachineFrameInfo &MFI = MF.getFrameInfo();
8308   bool OptSize = shouldLowerMemFuncForSize(MF);
8309 
8310   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
8311   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
8312     DstAlignCanChange = true;
8313 
8314   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
8315   std::vector<LLT> MemOps;
8316 
8317   const auto &DstMMO = **MI.memoperands_begin();
8318   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8319 
8320   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
8321   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
8322 
8323   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
8324                                      MemOp::Set(KnownLen, DstAlignCanChange,
8325                                                 Alignment,
8326                                                 /*IsZeroMemset=*/IsZeroVal,
8327                                                 /*IsVolatile=*/IsVolatile),
8328                                      DstPtrInfo.getAddrSpace(), ~0u,
8329                                      MF.getFunction().getAttributes(), TLI))
8330     return UnableToLegalize;
8331 
8332   if (DstAlignCanChange) {
8333     // Get an estimate of the type from the LLT.
8334     Type *IRTy = getTypeForLLT(MemOps[0], C);
8335     Align NewAlign = DL.getABITypeAlign(IRTy);
8336     if (NewAlign > Alignment) {
8337       Alignment = NewAlign;
8338       unsigned FI = FIDef->getOperand(1).getIndex();
8339       // Give the stack frame object a larger alignment if needed.
8340       if (MFI.getObjectAlign(FI) < Alignment)
8341         MFI.setObjectAlignment(FI, Alignment);
8342     }
8343   }
8344 
8345   MachineIRBuilder MIB(MI);
8346   // Find the largest store and generate the bit pattern for it.
8347   LLT LargestTy = MemOps[0];
8348   for (unsigned i = 1; i < MemOps.size(); i++)
8349     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
8350       LargestTy = MemOps[i];
8351 
8352   // The memset stored value is always defined as an s8, so in order to make it
8353   // work with larger store types we need to repeat the bit pattern across the
8354   // wider type.
8355   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
8356 
8357   if (!MemSetValue)
8358     return UnableToLegalize;
8359 
8360   // Generate the stores. For each store type in the list, we generate the
8361   // matching store of that type to the destination address.
8362   LLT PtrTy = MRI.getType(Dst);
8363   unsigned DstOff = 0;
8364   unsigned Size = KnownLen;
8365   for (unsigned I = 0; I < MemOps.size(); I++) {
8366     LLT Ty = MemOps[I];
8367     unsigned TySize = Ty.getSizeInBytes();
8368     if (TySize > Size) {
8369       // Issuing an unaligned store that overlaps with the previous one.
8370       // Adjust the offset accordingly.
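           // E.g. a 9-byte memset lowered as {s64, s64} shifts the second
           // store back to offset 1, so the two stores cover bytes [0,8)
           // and [1,9).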
8371       assert(I == MemOps.size() - 1 && I != 0);
8372       DstOff -= TySize - Size;
8373     }
8374 
8375     // If this store is smaller than the largest store, see whether we can get
8376     // the smaller value for free with a truncate.
8377     Register Value = MemSetValue;
8378     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
8379       MVT VT = getMVTForLLT(Ty);
8380       MVT LargestVT = getMVTForLLT(LargestTy);
8381       if (!LargestTy.isVector() && !Ty.isVector() &&
8382           TLI.isTruncateFree(LargestVT, VT))
8383         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
8384       else
8385         Value = getMemsetValue(Val, Ty, MIB);
8386       if (!Value)
8387         return UnableToLegalize;
8388     }
8389 
8390     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
8391 
8392     Register Ptr = Dst;
8393     if (DstOff != 0) {
8394       auto Offset =
8395           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
8396       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
8397     }
8398 
8399     MIB.buildStore(Value, Ptr, *StoreMMO);
8400     DstOff += Ty.getSizeInBytes();
8401     Size -= TySize;
8402   }
8403 
8404   MI.eraseFromParent();
8405   return Legalized;
8406 }
8407 
8408 LegalizerHelper::LegalizeResult
8409 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
8410   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
8411 
8412   auto [Dst, Src, Len] = MI.getFirst3Regs();
8413 
8414   const auto *MMOIt = MI.memoperands_begin();
8415   const MachineMemOperand *MemOp = *MMOIt;
8416   bool IsVolatile = MemOp->isVolatile();
8417 
8418   // See if this is a constant length copy
8419   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
8420   // FIXME: support dynamically sized G_MEMCPY_INLINE
8421   assert(LenVRegAndVal &&
8422          "inline memcpy with dynamic size is not yet supported");
8423   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
8424   if (KnownLen == 0) {
8425     MI.eraseFromParent();
8426     return Legalized;
8427   }
8428 
8429   const auto &DstMMO = **MI.memoperands_begin();
8430   const auto &SrcMMO = **std::next(MI.memoperands_begin());
8431   Align DstAlign = DstMMO.getBaseAlign();
8432   Align SrcAlign = SrcMMO.getBaseAlign();
8433 
8434   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
8435                            IsVolatile);
8436 }
8437 
8438 LegalizerHelper::LegalizeResult
8439 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
8440                                    uint64_t KnownLen, Align DstAlign,
8441                                    Align SrcAlign, bool IsVolatile) {
8442   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
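       // Use an effectively unbounded store limit so the copy is always fully
       // inlined rather than rejected for exceeding the target's store limit.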
8443   return lowerMemcpy(MI, Dst, Src, KnownLen,
8444                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
8445                      IsVolatile);
8446 }
8447 
8448 LegalizerHelper::LegalizeResult
8449 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
8450                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
8451                              Align SrcAlign, bool IsVolatile) {
8452   auto &MF = *MI.getParent()->getParent();
8453   const auto &TLI = *MF.getSubtarget().getTargetLowering();
8454   auto &DL = MF.getDataLayout();
8455   LLVMContext &C = MF.getFunction().getContext();
8456 
8457   assert(KnownLen != 0 && "Have a zero-length memcpy!");
8458 
8459   bool DstAlignCanChange = false;
8460   MachineFrameInfo &MFI = MF.getFrameInfo();
8461   Align Alignment = std::min(DstAlign, SrcAlign);
8462 
8463   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
8464   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
8465     DstAlignCanChange = true;
8466 
8467   // FIXME: infer better src pointer alignment like SelectionDAG does here.
8468   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
8469   // if the memcpy is in a tail call position.
8470 
8471   std::vector<LLT> MemOps;
8472 
8473   const auto &DstMMO = **MI.memoperands_begin();
8474   const auto &SrcMMO = **std::next(MI.memoperands_begin());
8475   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8476   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
8477 
8478   if (!findGISelOptimalMemOpLowering(
8479           MemOps, Limit,
8480           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
8481                       IsVolatile),
8482           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
8483           MF.getFunction().getAttributes(), TLI))
8484     return UnableToLegalize;
8485 
8486   if (DstAlignCanChange) {
8487     // Get an estimate of the type from the LLT.
8488     Type *IRTy = getTypeForLLT(MemOps[0], C);
8489     Align NewAlign = DL.getABITypeAlign(IRTy);
8490 
8491     // Don't promote to an alignment that would require dynamic stack
8492     // realignment.
8493     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
8494     if (!TRI->hasStackRealignment(MF))
8495       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
8496         NewAlign = NewAlign.previous();
8497 
8498     if (NewAlign > Alignment) {
8499       Alignment = NewAlign;
8500       unsigned FI = FIDef->getOperand(1).getIndex();
8501       // Give the stack frame object a larger alignment if needed.
8502       if (MFI.getObjectAlign(FI) < Alignment)
8503         MFI.setObjectAlignment(FI, Alignment);
8504     }
8505   }
8506 
8507   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
8508 
8509   MachineIRBuilder MIB(MI);
8510   // Now we need to emit a load/store pair for each of the types we've
8511   // collected. I.e. for each type, generate a load of that width from the
8512   // source pointer, then a corresponding store of the loaded value to the
8513   // dest buffer. This can result in a sequence of loads and stores of mixed
8514   // types, depending on what the target specifies as good types to use.
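       // For instance (illustrative MIR; %v, %src and %dst are placeholder
       // names), a plain 8-byte copy with MemOps == {s64} becomes roughly:
       //   %v:_(s64) = G_LOAD %src:_(p0) :: (load (s64))
       //   G_STORE %v:_(s64), %dst:_(p0) :: (store (s64))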
8515   unsigned CurrOffset = 0;
8516   unsigned Size = KnownLen;
8517   for (auto CopyTy : MemOps) {
8518     // Issuing an unaligned load/store pair that overlaps with the previous
8519     // pair. Adjust the offset accordingly.
8520     if (CopyTy.getSizeInBytes() > Size)
8521       CurrOffset -= CopyTy.getSizeInBytes() - Size;
8522 
8523     // Construct MMOs for the accesses.
8524     auto *LoadMMO =
8525         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
8526     auto *StoreMMO =
8527         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
8528 
8529     // Create the load.
8530     Register LoadPtr = Src;
8531     Register Offset;
8532     if (CurrOffset != 0) {
8533       LLT SrcTy = MRI.getType(Src);
8534       Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
8535                    .getReg(0);
8536       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
8537     }
8538     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
8539 
8540     // Create the store.
8541     Register StorePtr = Dst;
8542     if (CurrOffset != 0) {
8543       LLT DstTy = MRI.getType(Dst);
8544       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
8545     }
8546     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
8547     CurrOffset += CopyTy.getSizeInBytes();
8548     Size -= CopyTy.getSizeInBytes();
8549   }
8550 
8551   MI.eraseFromParent();
8552   return Legalized;
8553 }
8554 
8555 LegalizerHelper::LegalizeResult
8556 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
8557                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
8558                               bool IsVolatile) {
8559   auto &MF = *MI.getParent()->getParent();
8560   const auto &TLI = *MF.getSubtarget().getTargetLowering();
8561   auto &DL = MF.getDataLayout();
8562   LLVMContext &C = MF.getFunction().getContext();
8563 
8564   assert(KnownLen != 0 && "Have a zero-length memmove!");
8565 
8566   bool DstAlignCanChange = false;
8567   MachineFrameInfo &MFI = MF.getFrameInfo();
8568   bool OptSize = shouldLowerMemFuncForSize(MF);
8569   Align Alignment = std::min(DstAlign, SrcAlign);
8570 
8571   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
8572   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
8573     DstAlignCanChange = true;
8574 
8575   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
8576   std::vector<LLT> MemOps;
8577 
8578   const auto &DstMMO = **MI.memoperands_begin();
8579   const auto &SrcMMO = **std::next(MI.memoperands_begin());
8580   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8581   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
8582 
8583   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
8584   // to a bug in its findOptimalMemOpLowering implementation. For now do the
8585   // same thing here.
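       // (MemOp::Copy derives its AllowOverlap flag from !IsVolatile, so
       // passing /*IsVolatile*/ true below is what disables overlapping
       // accesses.)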
8586   if (!findGISelOptimalMemOpLowering(
8587           MemOps, Limit,
8588           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
8589                       /*IsVolatile*/ true),
8590           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
8591           MF.getFunction().getAttributes(), TLI))
8592     return UnableToLegalize;
8593 
8594   if (DstAlignCanChange) {
8595     // Get an estimate of the type from the LLT.
8596     Type *IRTy = getTypeForLLT(MemOps[0], C);
8597     Align NewAlign = DL.getABITypeAlign(IRTy);
8598 
8599     // Don't promote to an alignment that would require dynamic stack
8600     // realignment.
8601     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
8602     if (!TRI->hasStackRealignment(MF))
8603       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
8604         NewAlign = NewAlign.previous();
8605 
8606     if (NewAlign > Alignment) {
8607       Alignment = NewAlign;
8608       unsigned FI = FIDef->getOperand(1).getIndex();
8609       // Give the stack frame object a larger alignment if needed.
8610       if (MFI.getObjectAlign(FI) < Alignment)
8611         MFI.setObjectAlignment(FI, Alignment);
8612     }
8613   }
8614 
8615   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
8616 
8617   MachineIRBuilder MIB(MI);
8618   // Memmove requires that we perform all the loads before issuing the stores.
8619   // Apart from that, this loop is pretty much doing the same thing as the
8620   // memcpy codegen function.
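       // E.g. for an overlapping memmove(p + 1, p, 8), interleaving each store
       // with its load could clobber source bytes before they are read; doing
       // all the loads up front is safe however the regions overlap.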
8621   unsigned CurrOffset = 0;
8622   SmallVector<Register, 16> LoadVals;
8623   for (auto CopyTy : MemOps) {
8624     // Construct MMO for the load.
8625     auto *LoadMMO =
8626         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
8627 
8628     // Create the load.
8629     Register LoadPtr = Src;
8630     if (CurrOffset != 0) {
8631       LLT SrcTy = MRI.getType(Src);
8632       auto Offset =
8633           MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
8634       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
8635     }
8636     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
8637     CurrOffset += CopyTy.getSizeInBytes();
8638   }
8639 
8640   CurrOffset = 0;
8641   for (unsigned I = 0; I < MemOps.size(); ++I) {
8642     LLT CopyTy = MemOps[I];
8643     // Now store the values loaded.
8644     auto *StoreMMO =
8645         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
8646 
8647     Register StorePtr = Dst;
8648     if (CurrOffset != 0) {
8649       LLT DstTy = MRI.getType(Dst);
8650       auto Offset =
8651           MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
8652       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
8653     }
8654     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
8655     CurrOffset += CopyTy.getSizeInBytes();
8656   }
8657   MI.eraseFromParent();
8658   return Legalized;
8659 }
8660 
8661 LegalizerHelper::LegalizeResult
8662 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
8663   const unsigned Opc = MI.getOpcode();
8664   // This combine is fairly complex, so it's not written with a separate
8665   // matcher function.
8666   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
8667           Opc == TargetOpcode::G_MEMSET) &&
8668          "Expected memcpy like instruction");
8669 
8670   auto MMOIt = MI.memoperands_begin();
8671   const MachineMemOperand *MemOp = *MMOIt;
8672 
8673   Align DstAlign = MemOp->getBaseAlign();
8674   Align SrcAlign;
8675   auto [Dst, Src, Len] = MI.getFirst3Regs();
8676 
8677   if (Opc != TargetOpcode::G_MEMSET) {
8678     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
8679     MemOp = *(++MMOIt);
8680     SrcAlign = MemOp->getBaseAlign();
8681   }
8682 
8683   // See if this is a constant length copy
8684   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
8685   if (!LenVRegAndVal)
8686     return UnableToLegalize;
8687   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
8688 
8689   if (KnownLen == 0) {
8690     MI.eraseFromParent();
8691     return Legalized;
8692   }
8693 
8694   bool IsVolatile = MemOp->isVolatile();
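       // Note: the assert above only permits G_MEMCPY, G_MEMMOVE and G_MEMSET,
       // so this branch is only reachable in builds without assertions.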
8695   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
8696     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
8697                              IsVolatile);
8698 
8699   // Don't try to optimize volatile.
8700   if (IsVolatile)
8701     return UnableToLegalize;
8702 
8703   if (MaxLen && KnownLen > MaxLen)
8704     return UnableToLegalize;
8705 
8706   if (Opc == TargetOpcode::G_MEMCPY) {
8707     auto &MF = *MI.getParent()->getParent();
8708     const auto &TLI = *MF.getSubtarget().getTargetLowering();
8709     bool OptSize = shouldLowerMemFuncForSize(MF);
8710     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
8711     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
8712                        IsVolatile);
8713   }
8714   if (Opc == TargetOpcode::G_MEMMOVE)
8715     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
8716   if (Opc == TargetOpcode::G_MEMSET)
8717     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
8718   return UnableToLegalize;
8719 }
8720