1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
19 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24 #include "llvm/CodeGen/GlobalISel/Utils.h"
25 #include "llvm/CodeGen/LowLevelTypeUtils.h"
26 #include "llvm/CodeGen/MachineConstantPool.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/RuntimeLibcallUtil.h"
30 #include "llvm/CodeGen/TargetFrameLowering.h"
31 #include "llvm/CodeGen/TargetInstrInfo.h"
32 #include "llvm/CodeGen/TargetLowering.h"
33 #include "llvm/CodeGen/TargetOpcodes.h"
34 #include "llvm/CodeGen/TargetSubtargetInfo.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/Support/Debug.h"
37 #include "llvm/Support/MathExtras.h"
38 #include "llvm/Support/raw_ostream.h"
39 #include "llvm/Target/TargetMachine.h"
40 #include <numeric>
41 #include <optional>
42
43 #define DEBUG_TYPE "legalizer"
44
45 using namespace llvm;
46 using namespace LegalizeActions;
47 using namespace MIPatternMatch;
48
49 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
50 ///
51 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
52 /// with any leftover piece as type \p LeftoverTy
53 ///
54 /// Returns -1 in the first element of the pair if the breakdown is not
55 /// satisfiable.
56 static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy,LLT NarrowTy,LLT & LeftoverTy)57 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
58 assert(!LeftoverTy.isValid() && "this is an out argument");
59
60 unsigned Size = OrigTy.getSizeInBits();
61 unsigned NarrowSize = NarrowTy.getSizeInBits();
62 unsigned NumParts = Size / NarrowSize;
63 unsigned LeftoverSize = Size - NumParts * NarrowSize;
64 assert(Size > NarrowSize);
65
66 if (LeftoverSize == 0)
67 return {NumParts, 0};
68
69 if (NarrowTy.isVector()) {
70 unsigned EltSize = OrigTy.getScalarSizeInBits();
71 if (LeftoverSize % EltSize != 0)
72 return {-1, -1};
73 LeftoverTy =
74 LLT::scalarOrVector(ElementCount::getFixed(LeftoverSize / EltSize),
75 OrigTy.getElementType());
76 } else {
77 LeftoverTy = LLT::scalar(LeftoverSize);
78 }
79
80 int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
81 return std::make_pair(NumParts, NumLeftover);
82 }
83
getFloatTypeForLLT(LLVMContext & Ctx,LLT Ty)84 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
85
86 if (!Ty.isScalar())
87 return nullptr;
88
89 switch (Ty.getSizeInBits()) {
90 case 16:
91 return Type::getHalfTy(Ctx);
92 case 32:
93 return Type::getFloatTy(Ctx);
94 case 64:
95 return Type::getDoubleTy(Ctx);
96 case 80:
97 return Type::getX86_FP80Ty(Ctx);
98 case 128:
99 return Type::getFP128Ty(Ctx);
100 default:
101 return nullptr;
102 }
103 }
104
// Convenience constructor: takes the LegalizerInfo and TargetLowering from
// \p MF's subtarget and runs without value tracking (VT is null).
LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), VT(nullptr) {}
111
// Full constructor: caller supplies the LegalizerInfo explicitly and may pass
// an optional GISelValueTracking instance (may be null).
LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B, GISelValueTracking *VT)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), VT(VT) {}
117
/// Perform one legalization step on \p MI: query the target's legalization
/// rules and dispatch to the transformation that implements the requested
/// action.
LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "\nLegalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  // Intrinsics are legalized via a target hook rather than the action table.
  if (isa<GIntrinsic>(MI))
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized
                                                     : UnableToLegalize;
  default:
    // Includes actions the helper cannot implement (e.g. NotFound).
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}
162
/// Recombine \p PartRegs (each of type \p PartTy), plus an optional set of
/// odd-sized \p LeftoverRegs (of type \p LeftoverTy), into \p DstReg of type
/// \p ResultTy. \p LeftoverTy being invalid means there are no leftover
/// pieces.
void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    // No leftovers: a single merge-like instruction suffices.
    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs(PartRegs);
    AllRegs.append(LeftoverRegs.begin(), LeftoverRegs.end());
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  // Scalar result with leftovers: break everything down to a common (GCD)
  // type, then merge back up through the LCM type into the destination.
  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}
198
appendVectorElts(SmallVectorImpl<Register> & Elts,Register Reg)199 void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
200 Register Reg) {
201 LLT Ty = MRI.getType(Reg);
202 SmallVector<Register, 8> RegElts;
203 extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts,
204 MIRBuilder, MRI);
205 Elts.append(RegElts);
206 }
207
208 /// Merge \p PartRegs with different types into \p DstReg.
mergeMixedSubvectors(Register DstReg,ArrayRef<Register> PartRegs)209 void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
210 ArrayRef<Register> PartRegs) {
211 SmallVector<Register, 8> AllElts;
212 for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
213 appendVectorElts(AllElts, PartRegs[i]);
214
215 Register Leftover = PartRegs[PartRegs.size() - 1];
216 if (!MRI.getType(Leftover).isVector())
217 AllElts.push_back(Leftover);
218 else
219 appendVectorElts(AllElts, Leftover);
220
221 MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
222 }
223
224 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
getUnmergeResults(SmallVectorImpl<Register> & Regs,const MachineInstr & MI)225 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
226 const MachineInstr &MI) {
227 assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
228
229 const int StartIdx = Regs.size();
230 const int NumResults = MI.getNumOperands() - 1;
231 Regs.resize(Regs.size() + NumResults);
232 for (int I = 0; I != NumResults; ++I)
233 Regs[StartIdx + I] = MI.getOperand(I).getReg();
234 }
235
extractGCDType(SmallVectorImpl<Register> & Parts,LLT GCDTy,Register SrcReg)236 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
237 LLT GCDTy, Register SrcReg) {
238 LLT SrcTy = MRI.getType(SrcReg);
239 if (SrcTy == GCDTy) {
240 // If the source already evenly divides the result type, we don't need to do
241 // anything.
242 Parts.push_back(SrcReg);
243 } else {
244 // Need to split into common type sized pieces.
245 auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
246 getUnmergeResults(Parts, *Unmerge);
247 }
248 }
249
extractGCDType(SmallVectorImpl<Register> & Parts,LLT DstTy,LLT NarrowTy,Register SrcReg)250 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
251 LLT NarrowTy, Register SrcReg) {
252 LLT SrcTy = MRI.getType(SrcReg);
253 LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
254 extractGCDType(Parts, GCDTy, SrcReg);
255 return GCDTy;
256 }
257
/// Merge the \p GCDTy-typed pieces in \p VRegs up into NarrowTy-sized pieces
/// covering the LCM of \p DstTy and \p NarrowTy, padding with \p PadStrategy
/// (G_ZEXT / G_SEXT / G_ANYEXT) when the sources do not cover the full LCM
/// width. \p VRegs is replaced with the new pieces; returns the LCM type.
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      // Past the end of the original sources: fill with padding.
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}
348
buildWidenedRemergeToDst(Register DstReg,LLT LCMTy,ArrayRef<Register> RemergeRegs)349 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
350 ArrayRef<Register> RemergeRegs) {
351 LLT DstTy = MRI.getType(DstReg);
352
353 // Create the merge to the widened source, and extract the relevant bits into
354 // the result.
355
356 if (DstTy == LCMTy) {
357 MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
358 return;
359 }
360
361 auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
362 if (DstTy.isScalar() && LCMTy.isScalar()) {
363 MIRBuilder.buildTrunc(DstReg, Remerge);
364 return;
365 }
366
367 if (LCMTy.isVector()) {
368 unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
369 SmallVector<Register, 8> UnmergeDefs(NumDefs);
370 UnmergeDefs[0] = DstReg;
371 for (unsigned I = 1; I != NumDefs; ++I)
372 UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
373
374 MIRBuilder.buildUnmerge(UnmergeDefs,
375 MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
376 return;
377 }
378
379 llvm_unreachable("unhandled case");
380 }
381
/// Map a generic opcode plus a scalar bit width to the corresponding runtime
/// library call enumerator, e.g. G_FADD at Size == 32 yields RTLIB::ADD_F32.
/// Integer opcodes support 32/64/128 bits; FP opcodes additionally support 80.
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_LROUND:
    RTLIBCASE(LROUND_F);
  case TargetOpcode::G_LLROUND:
    RTLIBCASE(LLROUND_F);
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FTAN:
    RTLIBCASE(TAN_F);
  case TargetOpcode::G_FASIN:
    RTLIBCASE(ASIN_F);
  case TargetOpcode::G_FACOS:
    RTLIBCASE(ACOS_F);
  case TargetOpcode::G_FATAN:
    RTLIBCASE(ATAN_F);
  case TargetOpcode::G_FATAN2:
    RTLIBCASE(ATAN2_F);
  case TargetOpcode::G_FSINH:
    RTLIBCASE(SINH_F);
  case TargetOpcode::G_FCOSH:
    RTLIBCASE(COSH_F);
  case TargetOpcode::G_FTANH:
    RTLIBCASE(TANH_F);
  case TargetOpcode::G_FSINCOS:
    RTLIBCASE(SINCOS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    RTLIBCASE(TRUNC_F);
  case TargetOpcode::G_INTRINSIC_ROUND:
    RTLIBCASE(ROUND_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  case TargetOpcode::G_INTRINSIC_LRINT:
    RTLIBCASE(LRINT_F);
  case TargetOpcode::G_INTRINSIC_LLRINT:
    RTLIBCASE(LLRINT_F);
  }
  llvm_unreachable("Unknown libcall function");
#undef RTLIBCASE_INT
#undef RTLIBCASE
}
511
/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  // G_MEMCPY %0, %1, %2
  // $x0 = COPY %0
  // RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    // G_BZERO has no meaningful return value, so a COPY of it can't be the
    // `thisreturn` pattern.
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be the
    // returned value.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}
577
/// Emit a call to the external library function \p Name with calling
/// convention \p CC, lowering it as a tail call (and deleting the trailing
/// return/copy instructions) when \p MI is in tail position.
LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC, LostDebugLocObserver &LocObserver,
                    MachineInstr *MI) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  // Only attempt a tail call when the libcall's return type matches the
  // caller's (or is void) and MI sits in tail position.
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(),
                                *MIRBuilder.getMRI());

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }
  return LegalizerHelper::Legalized;
}
624
625 LegalizerHelper::LegalizeResult
createLibcall(MachineIRBuilder & MIRBuilder,RTLIB::Libcall Libcall,const CallLowering::ArgInfo & Result,ArrayRef<CallLowering::ArgInfo> Args,LostDebugLocObserver & LocObserver,MachineInstr * MI)626 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
627 const CallLowering::ArgInfo &Result,
628 ArrayRef<CallLowering::ArgInfo> Args,
629 LostDebugLocObserver &LocObserver, MachineInstr *MI) {
630 auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
631 const char *Name = TLI.getLibcallName(Libcall);
632 if (!Name)
633 return LegalizerHelper::UnableToLegalize;
634 const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
635 return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI);
636 }
637
638 // Useful for libcalls where all operands have the same type.
639 static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr & MI,MachineIRBuilder & MIRBuilder,unsigned Size,Type * OpType,LostDebugLocObserver & LocObserver)640 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
641 Type *OpType, LostDebugLocObserver &LocObserver) {
642 auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
643
644 // FIXME: What does the original arg index mean here?
645 SmallVector<CallLowering::ArgInfo, 3> Args;
646 for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
647 Args.push_back({MO.getReg(), OpType, 0});
648 return createLibcall(MIRBuilder, Libcall,
649 {MI.getOperand(0).getReg(), OpType, 0}, Args,
650 LocObserver, &MI);
651 }
652
/// Lower \p MI to a combined sin/cos libcall that writes both results through
/// pointers to stack temporaries, then load the two results back into the
/// instruction's destination registers and erase \p MI.
LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall(
    MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, Type *OpType,
    LostDebugLocObserver &LocObserver) {
  MachineFunction &MF = *MI.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register DstSin = MI.getOperand(0).getReg();
  Register DstCos = MI.getOperand(1).getReg();
  Register Src = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(DstSin);

  int MemSize = DstTy.getSizeInBytes();
  Align Alignment = getStackTemporaryAlignment(DstTy);
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getAllocaAddrSpace();
  MachinePointerInfo PtrInfo;

  // One stack slot per result; the libcall stores into these.
  Register StackPtrSin =
      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
          .getReg(0);
  Register StackPtrCos =
      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
          .getReg(0);

  // void sincos(src, sin*, cos*) — the call returns nothing; results come
  // back through the out-pointers.
  auto &Ctx = MF.getFunction().getContext();
  auto LibcallResult =
      createLibcall(MIRBuilder, getRTLibDesc(MI.getOpcode(), Size),
                    {{0}, Type::getVoidTy(Ctx), 0},
                    {{Src, OpType, 0},
                     {StackPtrSin, PointerType::get(Ctx, AddrSpace), 1},
                     {StackPtrCos, PointerType::get(Ctx, AddrSpace), 2}},
                    LocObserver, &MI);

  if (LibcallResult != LegalizeResult::Legalized)
    return LegalizerHelper::UnableToLegalize;

  MachineMemOperand *LoadMMOSin = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);
  MachineMemOperand *LoadMMOCos = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);

  // Read the results back out of the stack slots into the original defs.
  MIRBuilder.buildLoad(DstSin, StackPtrSin, *LoadMMOSin);
  MIRBuilder.buildLoad(DstCos, StackPtrCos, *LoadMMOCos);
  MI.eraseFromParent();

  return LegalizerHelper::Legalized;
}
700
/// Lower a G_BZERO / G_MEMCPY / G_MEMMOVE / G_MEMSET instruction to the
/// corresponding runtime library call. The instruction's final immediate
/// operand indicates whether a tail call was requested.
LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  const char *Name;
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    Name = TLI.getLibcallName(RTLibcall);
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Name = TLI.getMemcpyName();
    // memcpy returns its destination argument.
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Name = TLI.getLibcallName(RTLibcall);
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Name = TLI.getLibcallName(RTLibcall);
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  // Honor the 'tail' immediate, but only if MI really is in tail position.
  Info.IsTailCall =
      MI.getOperand(MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI);

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}
793
/// Select the outline-atomic runtime library call for \p MI based on its
/// opcode, the memory operand's merged atomic ordering, and the access size in
/// bytes. Returns UNKNOWN_LIBCALL for unsupported cases (including vector
/// memory types and unhandled opcodes).
static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

// LCALLS expands to the four ordering variants of one sized call; LCALL5
// expands to all five supported sizes (1/2/4/8/16 bytes).
#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
#define LCALL5(A)                                                              \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_AND: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}
840
// Replace an atomic cmpxchg / RMW instruction with a call to the matching
// outline atomic runtime routine (selected via getOutlineAtomicLibcall).
// Operands are marshalled into integer/pointer ArgInfos in the order the
// outline helpers expect (value operands first, memory pointer last), and the
// call is emitted through the target's CallLowering.
static LegalizerHelper::LegalizeResult
createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    // The WITH_SUCCESS variant carries an extra boolean result register, so
    // re-read the operands with the five-register accessor and return a
    // {old value, success} struct instead of a plain integer.
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
               NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Success);
      RetTy = StructType::get(
          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
    }
    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    // G_ATOMICRMW_AND maps to the LDCLR (and-with-complement) routine, so the
    // operand must be inverted; G_ATOMICRMW_SUB maps to LDADD, so the operand
    // must be negated (see getOutlineAtomicLibcall's opcode mapping).
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val)
              .getReg(0);
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val)
              .getReg(0);
    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}
918
getConvRTLibDesc(unsigned Opcode,Type * ToType,Type * FromType)919 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
920 Type *FromType) {
921 auto ToMVT = MVT::getVT(ToType);
922 auto FromMVT = MVT::getVT(FromType);
923
924 switch (Opcode) {
925 case TargetOpcode::G_FPEXT:
926 return RTLIB::getFPEXT(FromMVT, ToMVT);
927 case TargetOpcode::G_FPTRUNC:
928 return RTLIB::getFPROUND(FromMVT, ToMVT);
929 case TargetOpcode::G_FPTOSI:
930 return RTLIB::getFPTOSINT(FromMVT, ToMVT);
931 case TargetOpcode::G_FPTOUI:
932 return RTLIB::getFPTOUINT(FromMVT, ToMVT);
933 case TargetOpcode::G_SITOFP:
934 return RTLIB::getSINTTOFP(FromMVT, ToMVT);
935 case TargetOpcode::G_UITOFP:
936 return RTLIB::getUINTTOFP(FromMVT, ToMVT);
937 }
938 llvm_unreachable("Unsupported libcall function");
939 }
940
941 static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr & MI,MachineIRBuilder & MIRBuilder,Type * ToType,Type * FromType,LostDebugLocObserver & LocObserver,const TargetLowering & TLI,bool IsSigned=false)942 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
943 Type *FromType, LostDebugLocObserver &LocObserver,
944 const TargetLowering &TLI, bool IsSigned = false) {
945 CallLowering::ArgInfo Arg = {MI.getOperand(1).getReg(), FromType, 0};
946 if (FromType->isIntegerTy()) {
947 if (TLI.shouldSignExtendTypeInLibCall(FromType, IsSigned))
948 Arg.Flags[0].setSExt();
949 else
950 Arg.Flags[0].setZExt();
951 }
952
953 RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
954 return createLibcall(MIRBuilder, Libcall,
955 {MI.getOperand(0).getReg(), ToType, 0}, Arg, LocObserver,
956 &MI);
957 }
958
959 static RTLIB::Libcall
getStateLibraryFunctionFor(MachineInstr & MI,const TargetLowering & TLI)960 getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
961 RTLIB::Libcall RTLibcall;
962 switch (MI.getOpcode()) {
963 case TargetOpcode::G_GET_FPENV:
964 RTLibcall = RTLIB::FEGETENV;
965 break;
966 case TargetOpcode::G_SET_FPENV:
967 case TargetOpcode::G_RESET_FPENV:
968 RTLibcall = RTLIB::FESETENV;
969 break;
970 case TargetOpcode::G_GET_FPMODE:
971 RTLibcall = RTLIB::FEGETMODE;
972 break;
973 case TargetOpcode::G_SET_FPMODE:
974 case TargetOpcode::G_RESET_FPMODE:
975 RTLibcall = RTLIB::FESETMODE;
976 break;
977 default:
978 llvm_unreachable("Unexpected opcode");
979 }
980 return RTLibcall;
981 }
982
983 // Some library functions that read FP state (fegetmode, fegetenv) write the
984 // state into a region in memory. IR intrinsics that do the same operations
985 // (get_fpmode, get_fpenv) return the state as integer value. To implement these
986 // intrinsics via the library functions, we need to use temporary variable,
987 // for example:
988 //
989 // %0:_(s32) = G_GET_FPMODE
990 //
991 // is transformed to:
992 //
993 // %1:_(p0) = G_FRAME_INDEX %stack.0
994 // BL &fegetmode
995 // %0:_(s32) = G_LOAD % 1
996 //
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will put the read state.
  Register Dst = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  // Align the slot as an ordinary stack temporary of the state's type.
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Create a call to library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  // The library routine returns void; the state is written through the
  // pointer argument.
  auto Res =
      createLibcall(MIRBuilder, RTLibcall,
                    CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                    CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                    LocObserver, nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
  MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);

  return LegalizerHelper::Legalized;
}
1033
1034 // Similar to `createGetStateLibcall` the function calls a library function
1035 // using transient space in stack. In this case the library function reads
1036 // content of memory region.
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will get the new state.
  Register Src = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Put the new state into the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
  MIRBuilder.buildStore(Src, Temp, *MMO);

  // Create a call to library function, with the temporary as an argument.
  // The routine returns void and reads the state through the pointer.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                       LocObserver, nullptr);
}
1068
1069 /// Returns the corresponding libcall for the given Pred and
1070 /// the ICMP predicate that should be generated to compare with #0
1071 /// after the libcall.
static std::pair<RTLIB::Libcall, CmpInst::Predicate>
getFCMPLibcallDesc(const CmpInst::Predicate Pred, unsigned Size) {
// Expands to a size-dispatched return of {libcall, icmp-predicate} for
// 32/64/128-bit operands; other sizes are rejected by the caller before
// this point.
#define RTLIBCASE_CMP(LibcallPrefix, ICmpPred)                                 \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return {RTLIB::LibcallPrefix##32, ICmpPred};                             \
    case 64:                                                                   \
      return {RTLIB::LibcallPrefix##64, ICmpPred};                             \
    case 128:                                                                  \
      return {RTLIB::LibcallPrefix##128, ICmpPred};                            \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Pred) {
  case CmpInst::FCMP_OEQ:
    RTLIBCASE_CMP(OEQ_F, CmpInst::ICMP_EQ);
  case CmpInst::FCMP_UNE:
    RTLIBCASE_CMP(UNE_F, CmpInst::ICMP_NE);
  case CmpInst::FCMP_OGE:
    RTLIBCASE_CMP(OGE_F, CmpInst::ICMP_SGE);
  case CmpInst::FCMP_OLT:
    RTLIBCASE_CMP(OLT_F, CmpInst::ICMP_SLT);
  case CmpInst::FCMP_OLE:
    RTLIBCASE_CMP(OLE_F, CmpInst::ICMP_SLE);
  case CmpInst::FCMP_OGT:
    RTLIBCASE_CMP(OGT_F, CmpInst::ICMP_SGT);
  case CmpInst::FCMP_UNO:
    RTLIBCASE_CMP(UO_F, CmpInst::ICMP_NE);
  default:
    // No single libcall maps to this predicate; the caller composes it from
    // multiple libcalls (or gives up).
    return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
  }
}
1107
// Lower a G_FCMP to one or more soft-float comparison libcalls, each followed
// by an integer compare of the libcall result against zero. Predicates with
// no direct libcall (UEQ, ONE, and the unordered variants) are composed from
// the OEQ/UNO routines or from the inverse predicate.
LegalizerHelper::LegalizeResult
LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
                                   MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();
  const GFCmp *Cmp = cast<GFCmp>(&MI);

  // Only 32/64/128-bit scalars with matching operand types are supported.
  LLT OpLLT = MRI.getType(Cmp->getLHSReg());
  unsigned Size = OpLLT.getSizeInBits();
  if ((Size != 32 && Size != 64 && Size != 128) ||
      OpLLT != MRI.getType(Cmp->getRHSReg()))
    return UnableToLegalize;

  Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);

  // DstReg type is s32
  const Register DstReg = Cmp->getReg(0);
  LLT DstTy = MRI.getType(DstReg);
  const auto Cond = Cmp->getCond();

  // Reference:
  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
  // Generates a libcall followed by ICMP.
  const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
                                const CmpInst::Predicate ICmpPred,
                                const DstOp &Res) -> Register {
    // FCMP libcall always returns an i32, and needs an ICMP with #0.
    constexpr LLT TempLLT = LLT::scalar(32);
    Register Temp = MRI.createGenericVirtualRegister(TempLLT);
    // Generate libcall, holding result in Temp
    const auto Status = createLibcall(
        MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
        {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
        LocObserver, &MI);
    if (!Status)
      return {};

    // Compare temp with #0 to get the final result.
    return MIRBuilder
        .buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
        .getReg(0);
  };

  // Simple case if we have a direct mapping from predicate to libcall
  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond, Size);
      Libcall != RTLIB::UNKNOWN_LIBCALL &&
      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
      return Legalized;
    }
    return UnableToLegalize;
  }

  // No direct mapping found, should be generated as combination of libcalls.

  switch (Cond) {
  case CmpInst::FCMP_UEQ: {
    // FCMP_UEQ: unordered or equal
    // Convert into (FCMP_OEQ || FCMP_UNO).

    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_OEQ, Size);
    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred, DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_UNO, Size);
    const auto Uno = BuildLibcall(UnoLibcall, UnoPred, DstTy);
    if (Oeq && Uno)
      MIRBuilder.buildOr(DstReg, Oeq, Uno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ONE: {
    // FCMP_ONE: ordered and operands are unequal
    // Convert into (!FCMP_OEQ && !FCMP_UNO).

    // We inverse the predicate instead of generating a NOT
    // to save one instruction.
    // On AArch64 isel can even select two cmp into a single ccmp.
    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_OEQ, Size);
    const auto NotOeq =
        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred), DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_UNO, Size);
    const auto NotUno =
        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred), DstTy);

    if (NotOeq && NotUno)
      MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT:
  case CmpInst::FCMP_ULE:
  case CmpInst::FCMP_ORD: {
    // Convert into: !(inverse(Pred))
    // E.g. FCMP_ULT becomes !FCMP_OGE
    // This is equivalent to the following, but saves some instructions.
    //   MIRBuilder.buildNot(
    //       PredTy,
    //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
    //                            Op1, Op2));
    const auto [InversedLibcall, InversedPred] =
        getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond), Size);
    if (!BuildLibcall(InversedLibcall,
                      CmpInst::getInversePredicate(InversedPred), DstReg))
      return UnableToLegalize;
    break;
  }
  default:
    return UnableToLegalize;
  }

  return Legalized;
}
1232
1233 // The function is used to legalize operations that set default environment
1234 // state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
1235 // On most targets supported in glibc FE_DFL_MODE is defined as
1236 // `((const femode_t *) -1)`. Such assumption is used here. If for some target
1237 // it is not true, the target must provide custom lowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                         MachineInstr &MI,
                                         LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();

  // Create an argument for the library function.
  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
  unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
  LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
  // Materialize the all-ones pointer, matching glibc's
  // `((const femode_t *) -1)` definition of FE_DFL_MODE (see the comment
  // above this function).
  auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
  DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
  MIRBuilder.buildIntToPtr(Dest, DefValue);

  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
                       LocObserver, &MI);
}
1261
// Legalize MI by replacing it with a runtime library call. Cases that `break`
// fall through to the single eraseFromParent at the bottom; cases that
// `return` directly are responsible for erasing MI themselves (or have
// already failed without modifying it).
LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  // Integer arithmetic: libcall operates on a plain iN of the result width.
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  // Floating-point ops with a same-typed result: only f32/f64/f80/f128 have
  // runtime routines.
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FTAN:
  case TargetOpcode::G_FACOS:
  case TargetOpcode::G_FASIN:
  case TargetOpcode::G_FATAN:
  case TargetOpcode::G_FATAN2:
  case TargetOpcode::G_FCOSH:
  case TargetOpcode::G_FSINH:
  case TargetOpcode::G_FTANH:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FSINCOS: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    return emitSincosLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
  }
  // FP -> integer rounding: result is an integer of the destination width,
  // source is a float of the operand-1 width.
  case TargetOpcode::G_LROUND:
  case TargetOpcode::G_LLROUND:
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT: {
    LLT LLTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ITy, 0},
                      {{MI.getOperand(1).getReg(), HLTy, 0}}, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Legalized;
  }
  // Mixed float/int operands: the integer exponent is passed sign-extended.
  case TargetOpcode::G_FPOWI:
  case TargetOpcode::G_FLDEXP: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    SmallVector<CallLowering::ArgInfo, 2> Args = {
        {MI.getOperand(1).getReg(), HLTy, 0},
        {MI.getOperand(2).getReg(), ITy, 1}};
    Args[1].Flags[0].setSExt();
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0},
                      Args, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver, TLI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FCMP: {
    LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Status;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64 && ToSize != 128) || !FromTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder, Type::getIntNTy(Ctx, ToSize), FromTy, LocObserver, TLI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if ((FromSize != 32 && FromSize != 64 && FromSize != 128) || !ToTy)
      return UnableToLegalize;
    bool IsSigned = MI.getOpcode() == TargetOpcode::G_SITOFP;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, Type::getIntNTy(Ctx, FromSize),
                          LocObserver, TLI, IsSigned);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto Status = createAtomicLibcall(MIRBuilder, MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_GET_FPMODE: {
    LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_SET_FPMODE: {
    LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_RESET_FPENV:
  case TargetOpcode::G_RESET_FPMODE: {
    LegalizeResult Result =
        createResetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  }

  // Common exit for all `break` cases above: the libcall replaced MI.
  MI.eraseFromParent();
  return Legalized;
}
1482
narrowScalar(MachineInstr & MI,unsigned TypeIdx,LLT NarrowTy)1483 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
1484 unsigned TypeIdx,
1485 LLT NarrowTy) {
1486 uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1487 uint64_t NarrowSize = NarrowTy.getSizeInBits();
1488
1489 switch (MI.getOpcode()) {
1490 default:
1491 return UnableToLegalize;
1492 case TargetOpcode::G_IMPLICIT_DEF: {
1493 Register DstReg = MI.getOperand(0).getReg();
1494 LLT DstTy = MRI.getType(DstReg);
1495
1496 // If SizeOp0 is not an exact multiple of NarrowSize, emit
1497 // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
1498 // FIXME: Although this would also be legal for the general case, it causes
1499 // a lot of regressions in the emitted code (superfluous COPYs, artifact
1500 // combines not being hit). This seems to be a problem related to the
1501 // artifact combiner.
1502 if (SizeOp0 % NarrowSize != 0) {
1503 LLT ImplicitTy = NarrowTy;
1504 if (DstTy.isVector())
1505 ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
1506
1507 Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
1508 MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
1509
1510 MI.eraseFromParent();
1511 return Legalized;
1512 }
1513
1514 int NumParts = SizeOp0 / NarrowSize;
1515
1516 SmallVector<Register, 2> DstRegs;
1517 for (int i = 0; i < NumParts; ++i)
1518 DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
1519
1520 if (DstTy.isVector())
1521 MIRBuilder.buildBuildVector(DstReg, DstRegs);
1522 else
1523 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1524 MI.eraseFromParent();
1525 return Legalized;
1526 }
1527 case TargetOpcode::G_CONSTANT: {
1528 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1529 const APInt &Val = MI.getOperand(1).getCImm()->getValue();
1530 unsigned TotalSize = Ty.getSizeInBits();
1531 unsigned NarrowSize = NarrowTy.getSizeInBits();
1532 int NumParts = TotalSize / NarrowSize;
1533
1534 SmallVector<Register, 4> PartRegs;
1535 for (int I = 0; I != NumParts; ++I) {
1536 unsigned Offset = I * NarrowSize;
1537 auto K = MIRBuilder.buildConstant(NarrowTy,
1538 Val.lshr(Offset).trunc(NarrowSize));
1539 PartRegs.push_back(K.getReg(0));
1540 }
1541
1542 LLT LeftoverTy;
1543 unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
1544 SmallVector<Register, 1> LeftoverRegs;
1545 if (LeftoverBits != 0) {
1546 LeftoverTy = LLT::scalar(LeftoverBits);
1547 auto K = MIRBuilder.buildConstant(
1548 LeftoverTy,
1549 Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
1550 LeftoverRegs.push_back(K.getReg(0));
1551 }
1552
1553 insertParts(MI.getOperand(0).getReg(),
1554 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
1555
1556 MI.eraseFromParent();
1557 return Legalized;
1558 }
1559 case TargetOpcode::G_SEXT:
1560 case TargetOpcode::G_ZEXT:
1561 case TargetOpcode::G_ANYEXT:
1562 return narrowScalarExt(MI, TypeIdx, NarrowTy);
1563 case TargetOpcode::G_TRUNC: {
1564 if (TypeIdx != 1)
1565 return UnableToLegalize;
1566
1567 uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1568 if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1569 LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1570 return UnableToLegalize;
1571 }
1572
1573 auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
1574 MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
1575 MI.eraseFromParent();
1576 return Legalized;
1577 }
1578 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
1579 case TargetOpcode::G_FREEZE: {
1580 if (TypeIdx != 0)
1581 return UnableToLegalize;
1582
1583 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1584 // Should widen scalar first
1585 if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1586 return UnableToLegalize;
1587
1588 auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
1589 SmallVector<Register, 8> Parts;
1590 for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1591 Parts.push_back(
1592 MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy}, {Unmerge.getReg(i)})
1593 .getReg(0));
1594 }
1595
1596 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
1597 MI.eraseFromParent();
1598 return Legalized;
1599 }
1600 case TargetOpcode::G_ADD:
1601 case TargetOpcode::G_SUB:
1602 case TargetOpcode::G_SADDO:
1603 case TargetOpcode::G_SSUBO:
1604 case TargetOpcode::G_SADDE:
1605 case TargetOpcode::G_SSUBE:
1606 case TargetOpcode::G_UADDO:
1607 case TargetOpcode::G_USUBO:
1608 case TargetOpcode::G_UADDE:
1609 case TargetOpcode::G_USUBE:
1610 return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1611 case TargetOpcode::G_MUL:
1612 case TargetOpcode::G_UMULH:
1613 return narrowScalarMul(MI, NarrowTy);
1614 case TargetOpcode::G_EXTRACT:
1615 return narrowScalarExtract(MI, TypeIdx, NarrowTy);
1616 case TargetOpcode::G_INSERT:
1617 return narrowScalarInsert(MI, TypeIdx, NarrowTy);
1618 case TargetOpcode::G_LOAD: {
1619 auto &LoadMI = cast<GLoad>(MI);
1620 Register DstReg = LoadMI.getDstReg();
1621 LLT DstTy = MRI.getType(DstReg);
1622 if (DstTy.isVector())
1623 return UnableToLegalize;
1624
1625 if (8 * LoadMI.getMemSize().getValue() != DstTy.getSizeInBits()) {
1626 Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1627 MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
1628 MIRBuilder.buildAnyExt(DstReg, TmpReg);
1629 LoadMI.eraseFromParent();
1630 return Legalized;
1631 }
1632
1633 return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
1634 }
1635 case TargetOpcode::G_ZEXTLOAD:
1636 case TargetOpcode::G_SEXTLOAD: {
1637 auto &LoadMI = cast<GExtLoad>(MI);
1638 Register DstReg = LoadMI.getDstReg();
1639 Register PtrReg = LoadMI.getPointerReg();
1640
1641 Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1642 auto &MMO = LoadMI.getMMO();
1643 unsigned MemSize = MMO.getSizeInBits().getValue();
1644
1645 if (MemSize == NarrowSize) {
1646 MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
1647 } else if (MemSize < NarrowSize) {
1648 MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
1649 } else if (MemSize > NarrowSize) {
1650 // FIXME: Need to split the load.
1651 return UnableToLegalize;
1652 }
1653
1654 if (isa<GZExtLoad>(LoadMI))
1655 MIRBuilder.buildZExt(DstReg, TmpReg);
1656 else
1657 MIRBuilder.buildSExt(DstReg, TmpReg);
1658
1659 LoadMI.eraseFromParent();
1660 return Legalized;
1661 }
1662 case TargetOpcode::G_STORE: {
1663 auto &StoreMI = cast<GStore>(MI);
1664
1665 Register SrcReg = StoreMI.getValueReg();
1666 LLT SrcTy = MRI.getType(SrcReg);
1667 if (SrcTy.isVector())
1668 return UnableToLegalize;
1669
1670 int NumParts = SizeOp0 / NarrowSize;
1671 unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1672 unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1673 if (SrcTy.isVector() && LeftoverBits != 0)
1674 return UnableToLegalize;
1675
1676 if (8 * StoreMI.getMemSize().getValue() != SrcTy.getSizeInBits()) {
1677 Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1678 MIRBuilder.buildTrunc(TmpReg, SrcReg);
1679 MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1680 StoreMI.eraseFromParent();
1681 return Legalized;
1682 }
1683
1684 return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1685 }
1686 case TargetOpcode::G_SELECT:
1687 return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1688 case TargetOpcode::G_AND:
1689 case TargetOpcode::G_OR:
1690 case TargetOpcode::G_XOR: {
1691 // Legalize bitwise operation:
1692 // A = BinOp<Ty> B, C
1693 // into:
1694 // B1, ..., BN = G_UNMERGE_VALUES B
1695 // C1, ..., CN = G_UNMERGE_VALUES C
1696 // A1 = BinOp<Ty/N> B1, C2
1697 // ...
1698 // AN = BinOp<Ty/N> BN, CN
1699 // A = G_MERGE_VALUES A1, ..., AN
1700 return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1701 }
1702 case TargetOpcode::G_SHL:
1703 case TargetOpcode::G_LSHR:
1704 case TargetOpcode::G_ASHR:
1705 return narrowScalarShift(MI, TypeIdx, NarrowTy);
1706 case TargetOpcode::G_CTLZ:
1707 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1708 case TargetOpcode::G_CTTZ:
1709 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1710 case TargetOpcode::G_CTPOP:
1711 if (TypeIdx == 1)
1712 switch (MI.getOpcode()) {
1713 case TargetOpcode::G_CTLZ:
1714 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1715 return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1716 case TargetOpcode::G_CTTZ:
1717 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1718 return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1719 case TargetOpcode::G_CTPOP:
1720 return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1721 default:
1722 return UnableToLegalize;
1723 }
1724
1725 Observer.changingInstr(MI);
1726 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1727 Observer.changedInstr(MI);
1728 return Legalized;
1729 case TargetOpcode::G_INTTOPTR:
1730 if (TypeIdx != 1)
1731 return UnableToLegalize;
1732
1733 Observer.changingInstr(MI);
1734 narrowScalarSrc(MI, NarrowTy, 1);
1735 Observer.changedInstr(MI);
1736 return Legalized;
1737 case TargetOpcode::G_PTRTOINT:
1738 if (TypeIdx != 0)
1739 return UnableToLegalize;
1740
1741 Observer.changingInstr(MI);
1742 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1743 Observer.changedInstr(MI);
1744 return Legalized;
1745 case TargetOpcode::G_PHI: {
1746 // FIXME: add support for when SizeOp0 isn't an exact multiple of
1747 // NarrowSize.
1748 if (SizeOp0 % NarrowSize != 0)
1749 return UnableToLegalize;
1750
1751 unsigned NumParts = SizeOp0 / NarrowSize;
1752 SmallVector<Register, 2> DstRegs(NumParts);
1753 SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1754 Observer.changingInstr(MI);
1755 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1756 MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1757 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
1758 extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1759 SrcRegs[i / 2], MIRBuilder, MRI);
1760 }
1761 MachineBasicBlock &MBB = *MI.getParent();
1762 MIRBuilder.setInsertPt(MBB, MI);
1763 for (unsigned i = 0; i < NumParts; ++i) {
1764 DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1765 MachineInstrBuilder MIB =
1766 MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1767 for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1768 MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1769 }
1770 MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1771 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1772 Observer.changedInstr(MI);
1773 MI.eraseFromParent();
1774 return Legalized;
1775 }
1776 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1777 case TargetOpcode::G_INSERT_VECTOR_ELT: {
1778 if (TypeIdx != 2)
1779 return UnableToLegalize;
1780
1781 int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1782 Observer.changingInstr(MI);
1783 narrowScalarSrc(MI, NarrowTy, OpIdx);
1784 Observer.changedInstr(MI);
1785 return Legalized;
1786 }
1787 case TargetOpcode::G_ICMP: {
1788 Register LHS = MI.getOperand(2).getReg();
1789 LLT SrcTy = MRI.getType(LHS);
1790 CmpInst::Predicate Pred =
1791 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1792
1793 LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1794 SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1795 if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1796 LHSLeftoverRegs, MIRBuilder, MRI))
1797 return UnableToLegalize;
1798
1799 LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1800 SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1801 if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1802 RHSPartRegs, RHSLeftoverRegs, MIRBuilder, MRI))
1803 return UnableToLegalize;
1804
1805 // We now have the LHS and RHS of the compare split into narrow-type
1806 // registers, plus potentially some leftover type.
1807 Register Dst = MI.getOperand(0).getReg();
1808 LLT ResTy = MRI.getType(Dst);
1809 if (ICmpInst::isEquality(Pred)) {
1810 // For each part on the LHS and RHS, keep track of the result of XOR-ing
1811 // them together. For each equal part, the result should be all 0s. For
1812 // each non-equal part, we'll get at least one 1.
1813 auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1814 SmallVector<Register, 4> Xors;
1815 for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1816 auto LHS = std::get<0>(LHSAndRHS);
1817 auto RHS = std::get<1>(LHSAndRHS);
1818 auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1819 Xors.push_back(Xor);
1820 }
1821
1822 // Build a G_XOR for each leftover register. Each G_XOR must be widened
1823 // to the desired narrow type so that we can OR them together later.
1824 SmallVector<Register, 4> WidenedXors;
1825 for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1826 auto LHS = std::get<0>(LHSAndRHS);
1827 auto RHS = std::get<1>(LHSAndRHS);
1828 auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1829 LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1830 buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1831 /* PadStrategy = */ TargetOpcode::G_ZEXT);
1832 llvm::append_range(Xors, WidenedXors);
1833 }
1834
1835 // Now, for each part we broke up, we know if they are equal/not equal
1836 // based off the G_XOR. We can OR these all together and compare against
1837 // 0 to get the result.
1838 assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1839 auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1840 for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1841 Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1842 MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1843 } else {
1844 Register CmpIn;
1845 for (unsigned I = 0, E = LHSPartRegs.size(); I != E; ++I) {
1846 Register CmpOut;
1847 CmpInst::Predicate PartPred;
1848
1849 if (I == E - 1 && LHSLeftoverRegs.empty()) {
1850 PartPred = Pred;
1851 CmpOut = Dst;
1852 } else {
1853 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1854 CmpOut = MRI.createGenericVirtualRegister(ResTy);
1855 }
1856
1857 if (!CmpIn) {
1858 MIRBuilder.buildICmp(PartPred, CmpOut, LHSPartRegs[I],
1859 RHSPartRegs[I]);
1860 } else {
1861 auto Cmp = MIRBuilder.buildICmp(PartPred, ResTy, LHSPartRegs[I],
1862 RHSPartRegs[I]);
1863 auto CmpEq = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy,
1864 LHSPartRegs[I], RHSPartRegs[I]);
1865 MIRBuilder.buildSelect(CmpOut, CmpEq, CmpIn, Cmp);
1866 }
1867
1868 CmpIn = CmpOut;
1869 }
1870
1871 for (unsigned I = 0, E = LHSLeftoverRegs.size(); I != E; ++I) {
1872 Register CmpOut;
1873 CmpInst::Predicate PartPred;
1874
1875 if (I == E - 1 && LHSLeftoverRegs.empty()) {
1876 PartPred = Pred;
1877 CmpOut = Dst;
1878 } else {
1879 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1880 CmpOut = MRI.createGenericVirtualRegister(ResTy);
1881 }
1882
1883 if (!CmpIn) {
1884 MIRBuilder.buildICmp(PartPred, CmpOut, LHSLeftoverRegs[I],
1885 RHSLeftoverRegs[I]);
1886 } else {
1887 auto Cmp = MIRBuilder.buildICmp(PartPred, ResTy, LHSLeftoverRegs[I],
1888 RHSLeftoverRegs[I]);
1889 auto CmpEq =
1890 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy,
1891 LHSLeftoverRegs[I], RHSLeftoverRegs[I]);
1892 MIRBuilder.buildSelect(CmpOut, CmpEq, CmpIn, Cmp);
1893 }
1894
1895 CmpIn = CmpOut;
1896 }
1897 }
1898 MI.eraseFromParent();
1899 return Legalized;
1900 }
1901 case TargetOpcode::G_FCMP:
1902 if (TypeIdx != 0)
1903 return UnableToLegalize;
1904
1905 Observer.changingInstr(MI);
1906 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1907 Observer.changedInstr(MI);
1908 return Legalized;
1909
1910 case TargetOpcode::G_SEXT_INREG: {
1911 if (TypeIdx != 0)
1912 return UnableToLegalize;
1913
1914 int64_t SizeInBits = MI.getOperand(2).getImm();
1915
1916 // So long as the new type has more bits than the bits we're extending we
1917 // don't need to break it apart.
1918 if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1919 Observer.changingInstr(MI);
1920 // We don't lose any non-extension bits by truncating the src and
1921 // sign-extending the dst.
1922 MachineOperand &MO1 = MI.getOperand(1);
1923 auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1924 MO1.setReg(TruncMIB.getReg(0));
1925
1926 MachineOperand &MO2 = MI.getOperand(0);
1927 Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1928 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1929 MIRBuilder.buildSExt(MO2, DstExt);
1930 MO2.setReg(DstExt);
1931 Observer.changedInstr(MI);
1932 return Legalized;
1933 }
1934
1935 // Break it apart. Components below the extension point are unmodified. The
1936 // component containing the extension point becomes a narrower SEXT_INREG.
1937 // Components above it are ashr'd from the component containing the
1938 // extension point.
1939 if (SizeOp0 % NarrowSize != 0)
1940 return UnableToLegalize;
1941 int NumParts = SizeOp0 / NarrowSize;
1942
1943 // List the registers where the destination will be scattered.
1944 SmallVector<Register, 2> DstRegs;
1945 // List the registers where the source will be split.
1946 SmallVector<Register, 2> SrcRegs;
1947
1948 // Create all the temporary registers.
1949 for (int i = 0; i < NumParts; ++i) {
1950 Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1951
1952 SrcRegs.push_back(SrcReg);
1953 }
1954
1955 // Explode the big arguments into smaller chunks.
1956 MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1957
1958 Register AshrCstReg =
1959 MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1960 .getReg(0);
1961 Register FullExtensionReg;
1962 Register PartialExtensionReg;
1963
1964 // Do the operation on each small part.
1965 for (int i = 0; i < NumParts; ++i) {
1966 if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
1967 DstRegs.push_back(SrcRegs[i]);
1968 PartialExtensionReg = DstRegs.back();
1969 } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1970 assert(PartialExtensionReg &&
1971 "Expected to visit partial extension before full");
1972 if (FullExtensionReg) {
1973 DstRegs.push_back(FullExtensionReg);
1974 continue;
1975 }
1976 DstRegs.push_back(
1977 MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1978 .getReg(0));
1979 FullExtensionReg = DstRegs.back();
1980 } else {
1981 DstRegs.push_back(
1982 MIRBuilder
1983 .buildInstr(
1984 TargetOpcode::G_SEXT_INREG, {NarrowTy},
1985 {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1986 .getReg(0));
1987 PartialExtensionReg = DstRegs.back();
1988 }
1989 }
1990
1991 // Gather the destination registers into the final destination.
1992 Register DstReg = MI.getOperand(0).getReg();
1993 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1994 MI.eraseFromParent();
1995 return Legalized;
1996 }
1997 case TargetOpcode::G_BSWAP:
1998 case TargetOpcode::G_BITREVERSE: {
1999 if (SizeOp0 % NarrowSize != 0)
2000 return UnableToLegalize;
2001
2002 Observer.changingInstr(MI);
2003 SmallVector<Register, 2> SrcRegs, DstRegs;
2004 unsigned NumParts = SizeOp0 / NarrowSize;
2005 extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
2006 MIRBuilder, MRI);
2007
2008 for (unsigned i = 0; i < NumParts; ++i) {
2009 auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
2010 {SrcRegs[NumParts - 1 - i]});
2011 DstRegs.push_back(DstPart.getReg(0));
2012 }
2013
2014 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
2015
2016 Observer.changedInstr(MI);
2017 MI.eraseFromParent();
2018 return Legalized;
2019 }
2020 case TargetOpcode::G_PTR_ADD:
2021 case TargetOpcode::G_PTRMASK: {
2022 if (TypeIdx != 1)
2023 return UnableToLegalize;
2024 Observer.changingInstr(MI);
2025 narrowScalarSrc(MI, NarrowTy, 2);
2026 Observer.changedInstr(MI);
2027 return Legalized;
2028 }
2029 case TargetOpcode::G_FPTOUI:
2030 case TargetOpcode::G_FPTOSI:
2031 case TargetOpcode::G_FPTOUI_SAT:
2032 case TargetOpcode::G_FPTOSI_SAT:
2033 return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
2034 case TargetOpcode::G_FPEXT:
2035 if (TypeIdx != 0)
2036 return UnableToLegalize;
2037 Observer.changingInstr(MI);
2038 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
2039 Observer.changedInstr(MI);
2040 return Legalized;
2041 case TargetOpcode::G_FLDEXP:
2042 case TargetOpcode::G_STRICT_FLDEXP:
2043 return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy);
2044 case TargetOpcode::G_VSCALE: {
2045 Register Dst = MI.getOperand(0).getReg();
2046 LLT Ty = MRI.getType(Dst);
2047
2048 // Assume VSCALE(1) fits into a legal integer
2049 const APInt One(NarrowTy.getSizeInBits(), 1);
2050 auto VScaleBase = MIRBuilder.buildVScale(NarrowTy, One);
2051 auto ZExt = MIRBuilder.buildZExt(Ty, VScaleBase);
2052 auto C = MIRBuilder.buildConstant(Ty, *MI.getOperand(1).getCImm());
2053 MIRBuilder.buildMul(Dst, ZExt, C);
2054
2055 MI.eraseFromParent();
2056 return Legalized;
2057 }
2058 }
2059 }
2060
coerceToScalar(Register Val)2061 Register LegalizerHelper::coerceToScalar(Register Val) {
2062 LLT Ty = MRI.getType(Val);
2063 if (Ty.isScalar())
2064 return Val;
2065
2066 const DataLayout &DL = MIRBuilder.getDataLayout();
2067 LLT NewTy = LLT::scalar(Ty.getSizeInBits());
2068 if (Ty.isPointer()) {
2069 if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
2070 return Register();
2071 return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
2072 }
2073
2074 Register NewVal = Val;
2075
2076 assert(Ty.isVector());
2077 if (Ty.isPointerVector())
2078 NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
2079 return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
2080 }
2081
widenScalarSrc(MachineInstr & MI,LLT WideTy,unsigned OpIdx,unsigned ExtOpcode)2082 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
2083 unsigned OpIdx, unsigned ExtOpcode) {
2084 MachineOperand &MO = MI.getOperand(OpIdx);
2085 auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
2086 MO.setReg(ExtB.getReg(0));
2087 }
2088
narrowScalarSrc(MachineInstr & MI,LLT NarrowTy,unsigned OpIdx)2089 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
2090 unsigned OpIdx) {
2091 MachineOperand &MO = MI.getOperand(OpIdx);
2092 auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
2093 MO.setReg(ExtB.getReg(0));
2094 }
2095
widenScalarDst(MachineInstr & MI,LLT WideTy,unsigned OpIdx,unsigned TruncOpcode)2096 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
2097 unsigned OpIdx, unsigned TruncOpcode) {
2098 MachineOperand &MO = MI.getOperand(OpIdx);
2099 Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2100 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2101 MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
2102 MO.setReg(DstExt);
2103 }
2104
narrowScalarDst(MachineInstr & MI,LLT NarrowTy,unsigned OpIdx,unsigned ExtOpcode)2105 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
2106 unsigned OpIdx, unsigned ExtOpcode) {
2107 MachineOperand &MO = MI.getOperand(OpIdx);
2108 Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
2109 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2110 MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
2111 MO.setReg(DstTrunc);
2112 }
2113
moreElementsVectorDst(MachineInstr & MI,LLT WideTy,unsigned OpIdx)2114 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
2115 unsigned OpIdx) {
2116 MachineOperand &MO = MI.getOperand(OpIdx);
2117 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2118 Register Dst = MO.getReg();
2119 Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2120 MO.setReg(DstExt);
2121 MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
2122 }
2123
moreElementsVectorSrc(MachineInstr & MI,LLT MoreTy,unsigned OpIdx)2124 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
2125 unsigned OpIdx) {
2126 MachineOperand &MO = MI.getOperand(OpIdx);
2127 MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
2128 }
2129
bitcastSrc(MachineInstr & MI,LLT CastTy,unsigned OpIdx)2130 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2131 MachineOperand &Op = MI.getOperand(OpIdx);
2132 Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
2133 }
2134
bitcastDst(MachineInstr & MI,LLT CastTy,unsigned OpIdx)2135 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2136 MachineOperand &MO = MI.getOperand(OpIdx);
2137 Register CastDst = MRI.createGenericVirtualRegister(CastTy);
2138 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2139 MIRBuilder.buildBitcast(MO, CastDst);
2140 MO.setReg(CastDst);
2141 }
2142
/// Widen the scalar *source* operands (TypeIdx 1) of a G_MERGE_VALUES to
/// \p WideTy. If WideTy covers the whole destination, the sources are packed
/// directly with zext/shl/or; otherwise the sources are decomposed to a GCD
/// type and re-merged into WideTy-sized pieces. Vector destinations are not
/// handled.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
                                        LLT WideTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
  if (DstTy.isVector())
    return UnableToLegalize;

  LLT SrcTy = MRI.getType(Src1Reg);
  const int DstSize = DstTy.getSizeInBits();
  const int SrcSize = SrcTy.getSizeInBits();
  const int WideSize = WideTy.getSizeInBits();
  // Number of WideTy pieces needed to cover the destination, rounded up.
  const int NumMerge = (DstSize + WideSize - 1) / WideSize;

  unsigned NumOps = MI.getNumOperands();
  unsigned NumSrc = MI.getNumOperands() - 1;
  unsigned PartSize = DstTy.getSizeInBits() / NumSrc;

  if (WideSize >= DstSize) {
    // Directly pack the bits in the target type.
    // Zero-extend the first source; each remaining source is zero-extended,
    // shifted into position, and OR'd into the accumulated result.
    Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0);

    for (unsigned I = 2; I != NumOps; ++I) {
      const unsigned Offset = (I - 1) * PartSize;

      Register SrcReg = MI.getOperand(I).getReg();
      assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));

      auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

      // On the final OR, write straight into DstReg when no trunc/inttoptr
      // fixup will be needed.
      Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
        MRI.createGenericVirtualRegister(WideTy);

      auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
      auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
      MIRBuilder.buildOr(NextResult, ResultReg, Shl);
      ResultReg = NextResult;
    }

    if (WideSize > DstSize)
      MIRBuilder.buildTrunc(DstReg, ResultReg);
    else if (DstTy.isPointer())
      MIRBuilder.buildIntToPtr(DstReg, ResultReg);

    MI.eraseFromParent();
    return Legalized;
  }

  // Unmerge the original values to the GCD type, and recombine to the next
  // multiple greater than the original type.
  //
  // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
  // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
  // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
  // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
  // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
  // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
  // %12:_(s12) = G_MERGE_VALUES %10, %11
  //
  // Padding with undef if necessary:
  //
  // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
  // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
  // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
  // %7:_(s2) = G_IMPLICIT_DEF
  // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
  // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
  // %10:_(s12) = G_MERGE_VALUES %8, %9

  const int GCD = std::gcd(SrcSize, WideSize);
  LLT GCDTy = LLT::scalar(GCD);

  SmallVector<Register, 8> NewMergeRegs;
  SmallVector<Register, 8> Unmerges;
  LLT WideDstTy = LLT::scalar(NumMerge * WideSize);

  // Decompose the original operands if they don't evenly divide.
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    Register SrcReg = MO.getReg();
    if (GCD == SrcSize) {
      Unmerges.push_back(SrcReg);
    } else {
      auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
      for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
        Unmerges.push_back(Unmerge.getReg(J));
    }
  }

  // Pad with undef to the next size that is a multiple of the requested size.
  // NOTE(review): Unmerges.size() counts GCD-sized pieces while
  // NumMerge * WideSize is a bit count, so this loop may push more undef
  // entries than the NumMerge * PartsPerGCD actually consumed by the Slicer
  // below. The extras appear unused (harmless over-padding) — confirm before
  // tightening the bound.
  if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
    Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
      Unmerges.push_back(UndefReg);
  }

  const int PartsPerGCD = WideSize / GCD;

  // Build merges of each piece.
  ArrayRef<Register> Slicer(Unmerges);
  for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
    auto Merge =
        MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD));
    NewMergeRegs.push_back(Merge.getReg(0));
  }

  // A truncate may be necessary if the requested type doesn't evenly divide the
  // original result type.
  if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
    MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
  } else {
    auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
    MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
  }

  MI.eraseFromParent();
  return Legalized;
}
2262
/// Widen the *result* type (TypeIdx 0) of a G_UNMERGE_VALUES to \p WideTy.
/// When WideTy covers the whole source, each result is produced by shifting
/// and truncating the (possibly extended) source; otherwise the source is
/// any-extended to the LCM type, unmerged to WideTy, and the pieces are
/// re-merged into the original destinations (padding with dead defs as
/// needed). Vector sources and non-scalar results are not handled.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    if (SrcTy.isPointer()) {
      // Pointers in non-integral address spaces cannot be reinterpreted as
      // integers, so bail out.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
    }

    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
    unsigned DstSize = DstTy.getSizeInBits();

    MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
      auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(SrcTy, WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead defs
  // to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  //  %4:_(s192) = G_ANYEXT %0:_(s96)
  //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  //  ; unpack to GCD type, with extra dead defs
  //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
  //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
  //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(WideTy, DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible
  if (PartsPerRemerge == 1) {
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(MI.getOperand(Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
        }
      }

      MIB.addUse(Unmerge.getReg(I));
    }
  } else {
    // Unpack every wide piece to the GCD type, then re-merge runs of
    // PartsPerRemerge GCD pieces into each original destination.
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, Unmerge.getReg(J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Parts[Idx]);
      }

      MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
2391
/// Widen a G_EXTRACT to \p WideTy.
///
/// For TypeIdx 0 (the result), scalar extracts are lowered to a right shift
/// of the (possibly any-extended) source followed by a truncate; pointer
/// sources are first converted to integers when the address space allows it.
/// For TypeIdx 1 (the source), scalar sources are simply any-extended, and
/// vector sources are widened with the immediate offset rescaled to the wider
/// vector.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(2).getImm();

  if (TypeIdx == 0) {
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(DstReg,
                            MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      Src = MIRBuilder.buildAnyExt(WideTy, Src);
      ShiftTy = WideTy;
    }

    auto LShr = MIRBuilder.buildLShr(
      ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
    MIRBuilder.buildTrunc(DstReg, LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy.isScalar()) {
    // Widening a scalar source doesn't change the extracted bits.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  // Only element-aligned extracts from vectors are supported.
  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

  // Rescale the bit offset to account for the wider element type.
  MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                          Offset);
  widenScalarDst(MI, WideTy.getScalarType(), 0);
  Observer.changedInstr(MI);
  return Legalized;
}
2465
2466 LegalizerHelper::LegalizeResult
widenScalarInsert(MachineInstr & MI,unsigned TypeIdx,LLT WideTy)2467 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2468 LLT WideTy) {
2469 if (TypeIdx != 0 || WideTy.isVector())
2470 return UnableToLegalize;
2471 Observer.changingInstr(MI);
2472 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2473 widenScalarDst(MI, WideTy);
2474 Observer.changedInstr(MI);
2475 return Legalized;
2476 }
2477
/// Widen an overflow-producing add/sub (G_[SU]ADDO, G_[SU]SUBO,
/// G_[SU]ADDE, G_[SU]SUBE) to \p WideTy.
///
/// TypeIdx 1 widens only the carry-out (and, for the carry-in variants, the
/// carry-in) boolean. TypeIdx 0 performs the arithmetic in the wide type on
/// sign-/zero-extended operands, then detects overflow by re-extending the
/// truncated result and comparing against the wide result.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
                                           LLT WideTy) {
  unsigned Opcode;     // Wide arithmetic opcode to emit.
  unsigned ExtOpcode;  // Extension matching the operation's signedness.
  std::optional<Register> CarryIn;  // Set only for the *E (carry-in) forms.
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_SSUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_UADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_USUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  // The signed carry variants are mapped onto the unsigned carry opcodes,
  // with sign-extended operands.
  case TargetOpcode::G_SADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_SSUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_UADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_USUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  }

  if (TypeIdx == 1) {
    // Widen only the boolean carry operands.
    unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);

    Observer.changingInstr(MI);
    if (CarryIn)
      widenScalarSrc(MI, WideTy, 4, BoolExtOp);
    widenScalarDst(MI, WideTy, 1);

    Observer.changedInstr(MI);
    return Legalized;
  }

  auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
  auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
  // Do the arithmetic in the larger type.
  Register NewOp;
  if (CarryIn) {
    LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
    NewOp = MIRBuilder
                .buildInstr(Opcode, {WideTy, CarryOutTy},
                            {LHSExt, RHSExt, *CarryIn})
                .getReg(0);
  } else {
    NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
  }
  LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
  auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
  auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
  // There is no overflow if the ExtOp is the same as NewOp.
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
  // Now trunc the NewOp to the original result.
  MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
  MI.eraseFromParent();
  return Legalized;
}
2560
/// Widen a saturating add/sub/shift (G_[SU]ADDSAT, G_[SU]SUBSAT,
/// G_[SU]SHLSAT) to \p WideTy by shifting the operands into the high bits of
/// the wide type, performing the wide saturating op, and shifting back down,
/// so that saturation happens at the same bit position as in the narrow type.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
                                         LLT WideTy) {
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
                  MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
                  MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
                 MI.getOpcode() == TargetOpcode::G_USHLSAT;
  // We can convert this to:
  //   1. Any extend iN to iM
  //   2. SHL by M-N
  //   3. [US][ADD|SUB|SHL]SAT
  //   4. L/ASHR by M-N
  //
  // It may be more efficient to lower this to a min and a max operation in
  // the higher precision arithmetic if the promoted operation isn't legal,
  // but this decision is up to the target's lowering request.
  Register DstReg = MI.getOperand(0).getReg();

  unsigned NewBits = WideTy.getScalarSizeInBits();
  // Number of extra bits gained by widening; operands are shifted up by this.
  unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();

  // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
  // must not left shift the RHS to preserve the shift amount.
  auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
  auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
                     : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
  auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
  auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
  auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);

  auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
                                        {ShiftL, ShiftR}, MI.getFlags());

  // Use a shift that will preserve the number of sign bits when the trunc is
  // folded away.
  auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
                         : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);

  MIRBuilder.buildTrunc(DstReg, Result);
  MI.eraseFromParent();
  return Legalized;
}
2604
/// Widen G_UMULO / G_SMULO to \p WideTy. The multiply is redone in the wide
/// type and the overflow flag is recomputed there: overflow happened iff the
/// wide product does not round-trip through the original narrow type (and,
/// when the wide multiply itself can overflow, iff that wide mulo overflowed).
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
                                 LLT WideTy) {
  if (TypeIdx == 1) {
    // Only the boolean overflow result is being widened; the multiply itself
    // is untouched.
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  }

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
  auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
  LLT SrcTy = MRI.getType(LHS);
  LLT OverflowTy = MRI.getType(OriginalOverflow);
  unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();

  // To determine if the result overflowed in the larger type, we extend the
  // input to the larger type, do the multiply (checking if it overflows),
  // then also check the high bits of the result to see if overflow happened
  // there.
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
  auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});

  // Multiplication cannot overflow if the WideTy is >= 2 * original width,
  // so we don't need to check the overflow result of larger type Mulo.
  bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;

  unsigned MulOpc =
      WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;

  // Emit a plain G_MUL when the wide multiply is known exact; otherwise keep
  // the overflow-producing opcode so its carry-out can be OR'ed in below.
  MachineInstrBuilder Mulo;
  if (WideMulCanOverflow)
    Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy, OverflowTy},
                                 {LeftOperand, RightOperand});
  else
    Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy}, {LeftOperand, RightOperand});

  auto Mul = Mulo->getOperand(0);
  MIRBuilder.buildTrunc(Result, Mul);

  MachineInstrBuilder ExtResult;
  // Overflow occurred if it occurred in the larger type, or if the high part
  // of the result does not zero/sign-extend the low part. Check this second
  // possibility first.
  if (IsSigned) {
    // For signed, overflow occurred when the high part does not sign-extend
    // the low part.
    ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
  } else {
    // Unsigned overflow occurred when the high part does not zero-extend the
    // low part.
    ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
  }

  if (WideMulCanOverflow) {
    auto Overflow =
        MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
    // Finally check if the multiplication in the larger type itself overflowed.
    MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
  } else {
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
  }
  MI.eraseFromParent();
  return Legalized;
}
2671
/// Legalize \p MI by widening the scalar type at index \p TypeIdx to
/// \p WideTy: sources are extended (with an opcode chosen per operation so
/// the result is unchanged), the operation is performed at the wider width,
/// and the result is truncated back where needed. Returns UnableToLegalize
/// for unsupported opcode/TypeIdx combinations.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
    assert(TypeIdx == 0 && "atomicrmw with second scalar type");
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
    if (TypeIdx == 0) {
      Observer.changingInstr(MI);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }
    assert(TypeIdx == 1 &&
           "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
    // TypeIdx == 1: only the boolean success result is widened.
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_EXTRACT:
    return widenScalarExtract(MI, TypeIdx, WideTy);
  case TargetOpcode::G_INSERT:
    return widenScalarInsert(MI, TypeIdx, WideTy);
  case TargetOpcode::G_MERGE_VALUES:
    return widenScalarMergeValues(MI, TypeIdx, WideTy);
  case TargetOpcode::G_UNMERGE_VALUES:
    return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
  case TargetOpcode::G_UMULO:
  case TargetOpcode::G_SMULO:
    return widenScalarMulo(MI, TypeIdx, WideTy);
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SSHLSAT:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_USHLSAT:
    return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP: {
    if (TypeIdx == 0) {
      Observer.changingInstr(MI);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    Register SrcReg = MI.getOperand(1).getReg();

    // First extend the input.
    // CTTZ only looks at low bits, so an anyext is enough; the other counts
    // depend on the high bits being zero.
    unsigned ExtOpc = Opcode == TargetOpcode::G_CTTZ ||
                              Opcode == TargetOpcode::G_CTTZ_ZERO_UNDEF
                          ? TargetOpcode::G_ANYEXT
                          : TargetOpcode::G_ZEXT;
    auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
    LLT CurTy = MRI.getType(SrcReg);
    unsigned NewOpc = Opcode;
    if (NewOpc == TargetOpcode::G_CTTZ) {
      // The count is the same in the larger type except if the original
      // value was zero.  This can be handled by setting the bit just off
      // the top of the original type.
      auto TopBit =
          APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
      MIBSrc = MIRBuilder.buildOr(
          WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
      // Now we know the operand is non-zero, use the more relaxed opcode.
      NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
    }

    unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();

    if (Opcode == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
      // An optimization where the result is the CTLZ after the left shift by
      // (Difference in widety and current ty), that is,
      // MIBSrc = MIBSrc << (sizeinbits(WideTy) - sizeinbits(CurTy))
      // Result = ctlz MIBSrc
      MIBSrc = MIRBuilder.buildShl(WideTy, MIBSrc,
                                   MIRBuilder.buildConstant(WideTy, SizeDiff));
    }

    // Perform the operation at the larger size.
    auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
    // This is already the correct result for CTPOP and CTTZs
    if (Opcode == TargetOpcode::G_CTLZ) {
      // The correct result is NewOp - (Difference in widety and current ty).
      MIBNewOp = MIRBuilder.buildSub(
          WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
    }

    MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP: {
    // Any-extend the input, bswap at the wide width, then shift the swapped
    // bytes back down by the width difference before truncating.
    Observer.changingInstr(MI);
    Register DstReg = MI.getOperand(0).getReg();

    Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
    Register DstExt = MRI.createGenericVirtualRegister(WideTy);
    Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

    MI.getOperand(0).setReg(DstExt);

    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    LLT Ty = MRI.getType(DstReg);
    unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
    MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
    MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);

    MIRBuilder.buildTrunc(DstReg, ShrReg);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BITREVERSE: {
    // Like G_BSWAP: reverse at the wide width, then shift the reversed bits
    // down by the width difference before truncating.
    Observer.changingInstr(MI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT Ty = MRI.getType(DstReg);
    unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();

    Register DstExt = MRI.createGenericVirtualRegister(WideTy);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    MI.getOperand(0).setReg(DstExt);
    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
    auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
    MIRBuilder.buildTrunc(DstReg, Shift);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FREEZE:
  case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ABS:
    // Sign-extend so the sign of the value (which G_ABS depends on) is
    // preserved in the wide type.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ADD:
  case TargetOpcode::G_AND:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_SHUFFLE_VECTOR:
    // Perform operation at larger width (any extension is fine here, high bits
    // don't affect the result) and then truncate the result back to the
    // original type.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      // Offset/width operands must keep their unsigned values.
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SHL:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      assert(TypeIdx == 1);
      // The "number of bits to shift" operand must preserve its value as an
      // unsigned integer:
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ROTR:
  case TargetOpcode::G_ROTL:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SDIVREM:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      // The extension must match the shift: sext for ashr (preserve sign
      // bits), zext for lshr (preserve zero high bits).
      unsigned CvtOp = Opcode == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT
                                                      : TargetOpcode::G_ZEXT;

      widenScalarSrc(MI, WideTy, 1, CvtOp);
      widenScalarDst(MI, WideTy);
    } else {
      assert(TypeIdx == 1);
      // The "number of bits to shift" operand must preserve its value as an
      // unsigned integer:
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UDIVREM:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
    widenScalarDst(MI, WideTy);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());

    // Usually zext, but let the target prefer sext when that is cheaper.
    auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
    unsigned ExtOpc =
        TLI.isSExtCheaperThanZExt(getApproximateEVTForLLT(Ty, Ctx),
                                  getApproximateEVTForLLT(WideTy, Ctx))
            ? TargetOpcode::G_SEXT
            : TargetOpcode::G_ZEXT;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, ExtOpc);
    widenScalarSrc(MI, WideTy, 2, ExtOpc);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }

  case TargetOpcode::G_SELECT:
    Observer.changingInstr(MI);
    if (TypeIdx == 0) {
      // Perform operation at larger width (any extension is fine here, high
      // bits don't affect the result) and then truncate the result back to the
      // original type.
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
      // Explicit extension is required here since high bits affect the result.
      widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT:
  case TargetOpcode::G_IS_FPCLASS:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SITOFP:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UITOFP:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_FPTOSI_SAT:
  case TargetOpcode::G_FPTOUI_SAT:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      // Saturate the wide result to the narrow type's range before
      // truncating, so the clamp semantics of the narrow op are preserved.
      Register OldDst = MI.getOperand(0).getReg();
      LLT Ty = MRI.getType(OldDst);
      Register ExtReg = MRI.createGenericVirtualRegister(WideTy);
      Register NewDst;
      MI.getOperand(0).setReg(ExtReg);
      uint64_t ShortBits = Ty.getScalarSizeInBits();
      uint64_t WideBits = WideTy.getScalarSizeInBits();
      MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
      if (Opcode == TargetOpcode::G_FPTOSI_SAT) {
        // z = i16 fptosi_sat(a)
        // ->
        // x = i32 fptosi_sat(a)
        // y = smin(x, 32767)
        // z = smax(y, -32768)
        auto MaxVal = MIRBuilder.buildConstant(
            WideTy, APInt::getSignedMaxValue(ShortBits).sext(WideBits));
        auto MinVal = MIRBuilder.buildConstant(
            WideTy, APInt::getSignedMinValue(ShortBits).sext(WideBits));
        Register MidReg =
            MIRBuilder.buildSMin(WideTy, ExtReg, MaxVal).getReg(0);
        NewDst = MIRBuilder.buildSMax(WideTy, MidReg, MinVal).getReg(0);
      } else {
        // z = i16 fptoui_sat(a)
        // ->
        // x = i32 fptoui_sat(a)
        // y = smin(x, 65535)
        auto MaxVal = MIRBuilder.buildConstant(
            WideTy, APInt::getAllOnes(ShortBits).zext(WideBits));
        NewDst = MIRBuilder.buildUMin(WideTy, ExtReg, MaxVal).getReg(0);
      }
      MIRBuilder.buildTrunc(OldDst, NewDst);
    } else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    assert(!Ty.isPointerOrPointerVector() && "Can't widen type");
    if (!Ty.isScalar()) {
      // We need to widen the vector element type.
      Observer.changingInstr(MI);
      widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ANYEXT);
      // We also need to adjust the MMO to turn this into a truncating store.
      MachineMemOperand &MMO = **MI.memoperands_begin();
      MachineFunction &MF = MIRBuilder.getMF();
      auto *NewMMO = MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), Ty);
      MI.setMemRefs(MF, {NewMMO});
      Observer.changedInstr(MI);
      return Legalized;
    }

    Observer.changingInstr(MI);

    // i1 stores must zero-extend so the stored byte has a well-defined value.
    unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
      TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
    widenScalarSrc(MI, WideTy, 0, ExtType);

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    MachineOperand &SrcMO = MI.getOperand(1);
    LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
    unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
        MRI.getType(MI.getOperand(0).getReg()));
    assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
            ExtOpc == TargetOpcode::G_ANYEXT) &&
           "Illegal Extend");
    const APInt &SrcVal = SrcMO.getCImm()->getValue();
    const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
                           ? SrcVal.sext(WideTy.getSizeInBits())
                           : SrcVal.zext(WideTy.getSizeInBits());
    Observer.changingInstr(MI);
    SrcMO.setCImm(ConstantInt::get(Ctx, Val));

    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FCONSTANT: {
    // To avoid changing the bits of the constant due to extension to a larger
    // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
    MachineOperand &SrcMO = MI.getOperand(1);
    APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
    MIRBuilder.setInstrAndDebugLoc(MI);
    auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
    widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_IMPLICIT_DEF: {
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BRCOND:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_FCMP:
    Observer.changingInstr(MI);
    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else {
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ICMP:
    Observer.changingInstr(MI);
    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else {
      // Extension must match the predicate's signedness (sext for signed
      // compares), unless the target says sext is cheaper anyway.
      LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
      CmpInst::Predicate Pred =
          static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

      auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
      unsigned ExtOpcode =
          (CmpInst::isSigned(Pred) ||
           TLI.isSExtCheaperThanZExt(getApproximateEVTForLLT(SrcTy, Ctx),
                                     getApproximateEVTForLLT(WideTy, Ctx)))
              ? TargetOpcode::G_SEXT
              : TargetOpcode::G_ZEXT;
      widenScalarSrc(MI, WideTy, 2, ExtOpcode);
      widenScalarSrc(MI, WideTy, 3, ExtOpcode);
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_PTR_ADD:
    assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_PHI: {
    assert(TypeIdx == 0 && "Expecting only Idx 0");

    // Each incoming value is extended at the end of its predecessor block;
    // the widened PHI result is truncated after the PHI nodes.
    Observer.changingInstr(MI);
    for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
      widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
    }

    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
    if (TypeIdx == 0) {
      Register VecReg = MI.getOperand(1).getReg();
      LLT VecTy = MRI.getType(VecReg);
      Observer.changingInstr(MI);

      widenScalarSrc(
          MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
          TargetOpcode::G_ANYEXT);

      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx != 2)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    // TODO: Probably should be zext
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx == 0) {
      Observer.changingInstr(MI);
      const LLT WideEltTy = WideTy.getElementType();

      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideEltTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx == 1) {
      Observer.changingInstr(MI);

      Register VecReg = MI.getOperand(1).getReg();
      LLT VecTy = MRI.getType(VecReg);
      LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);

      widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideVecTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx == 2) {
      Observer.changingInstr(MI);
      // TODO: Probably should be zext
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
      Observer.changedInstr(MI);
      return Legalized;
    }

    return UnableToLegalize;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FTAN:
  case TargetOpcode::G_FACOS:
  case TargetOpcode::G_FASIN:
  case TargetOpcode::G_FATAN:
  case TargetOpcode::G_FATAN2:
  case TargetOpcode::G_FCOSH:
  case TargetOpcode::G_FSINH:
  case TargetOpcode::G_FTANH:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    assert(TypeIdx == 0);
    Observer.changingInstr(MI);

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
      widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);

    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_FPOWI:
  case TargetOpcode::G_FLDEXP:
  case TargetOpcode::G_STRICT_FLDEXP: {
    if (TypeIdx == 0) {
      if (Opcode == TargetOpcode::G_STRICT_FLDEXP)
        return UnableToLegalize;

      Observer.changingInstr(MI);
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx == 1) {
      // For some reason SelectionDAG tries to promote to a libcall without
      // actually changing the integer type for promotion.
      Observer.changingInstr(MI);
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
      Observer.changedInstr(MI);
      return Legalized;
    }

    return UnableToLegalize;
  }
  case TargetOpcode::G_FFREXP: {
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    } else {
      widenScalarDst(MI, WideTy, 1);
    }

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_BUILD_VECTOR: {
    Observer.changingInstr(MI);

    const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
    for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
      widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);

    // Avoid changing the result vector type if the source element type was
    // requested.
    if (TypeIdx == 1) {
      MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
    } else {
      widenScalarDst(MI, WideTy, 0);
    }

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_VECREDUCE_ADD: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy.getScalarType(), 0, TargetOpcode::G_TRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_VECREDUCE_FADD:
  case TargetOpcode::G_VECREDUCE_FMUL:
  case TargetOpcode::G_VECREDUCE_FMIN:
  case TargetOpcode::G_VECREDUCE_FMAX:
  case TargetOpcode::G_VECREDUCE_FMINIMUM:
  case TargetOpcode::G_VECREDUCE_FMAXIMUM: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    Register VecReg = MI.getOperand(1).getReg();
    LLT VecTy = MRI.getType(VecReg);
    LLT WideVecTy = VecTy.isVector()
                        ? LLT::vector(VecTy.getElementCount(), WideTy)
                        : WideTy;
    widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_FPEXT);
    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_VSCALE: {
    MachineOperand &SrcMO = MI.getOperand(1);
    LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
    const APInt &SrcVal = SrcMO.getCImm()->getValue();
    // The CImm is always a signed value
    const APInt Val = SrcVal.sext(WideTy.getSizeInBits());
    Observer.changingInstr(MI);
    SrcMO.setCImm(ConstantInt::get(Ctx, Val));
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SPLAT_VECTOR: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_INSERT_SUBVECTOR: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    GInsertSubvector &IS = cast<GInsertSubvector>(MI);
    Register BigVec = IS.getBigVec();
    Register SubVec = IS.getSubVec();

    LLT SubVecTy = MRI.getType(SubVec);
    LLT SubVecWideTy = SubVecTy.changeElementType(WideTy.getElementType());

    // Widen the G_INSERT_SUBVECTOR
    auto BigZExt = MIRBuilder.buildZExt(WideTy, BigVec);
    auto SubZExt = MIRBuilder.buildZExt(SubVecWideTy, SubVec);
    auto WideInsert = MIRBuilder.buildInsertSubvector(WideTy, BigZExt, SubZExt,
                                                      IS.getIndexImm());

    // Truncate back down
    auto SplatZero = MIRBuilder.buildSplatVector(
        WideTy, MIRBuilder.buildConstant(WideTy.getElementType(), 0));
    MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, IS.getReg(0), WideInsert,
                         SplatZero);

    MI.eraseFromParent();

    return Legalized;
  }
  }
}
3493
getUnmergePieces(SmallVectorImpl<Register> & Pieces,MachineIRBuilder & B,Register Src,LLT Ty)3494 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
3495 MachineIRBuilder &B, Register Src, LLT Ty) {
3496 auto Unmerge = B.buildUnmerge(Ty, Src);
3497 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3498 Pieces.push_back(Unmerge.getReg(I));
3499 }
3500
emitLoadFromConstantPool(Register DstReg,const Constant * ConstVal,MachineIRBuilder & MIRBuilder)3501 static void emitLoadFromConstantPool(Register DstReg, const Constant *ConstVal,
3502 MachineIRBuilder &MIRBuilder) {
3503 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3504 MachineFunction &MF = MIRBuilder.getMF();
3505 const DataLayout &DL = MIRBuilder.getDataLayout();
3506 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
3507 LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3508 LLT DstLLT = MRI.getType(DstReg);
3509
3510 Align Alignment(DL.getABITypeAlign(ConstVal->getType()));
3511
3512 auto Addr = MIRBuilder.buildConstantPool(
3513 AddrPtrTy,
3514 MF.getConstantPool()->getConstantPoolIndex(ConstVal, Alignment));
3515
3516 MachineMemOperand *MMO =
3517 MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
3518 MachineMemOperand::MOLoad, DstLLT, Alignment);
3519
3520 MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, DstReg, Addr, *MMO);
3521 }
3522
3523 LegalizerHelper::LegalizeResult
lowerConstant(MachineInstr & MI)3524 LegalizerHelper::lowerConstant(MachineInstr &MI) {
3525 const MachineOperand &ConstOperand = MI.getOperand(1);
3526 const Constant *ConstantVal = ConstOperand.getCImm();
3527
3528 emitLoadFromConstantPool(MI.getOperand(0).getReg(), ConstantVal, MIRBuilder);
3529 MI.eraseFromParent();
3530
3531 return Legalized;
3532 }
3533
3534 LegalizerHelper::LegalizeResult
lowerFConstant(MachineInstr & MI)3535 LegalizerHelper::lowerFConstant(MachineInstr &MI) {
3536 const MachineOperand &ConstOperand = MI.getOperand(1);
3537 const Constant *ConstantVal = ConstOperand.getFPImm();
3538
3539 emitLoadFromConstantPool(MI.getOperand(0).getReg(), ConstantVal, MIRBuilder);
3540 MI.eraseFromParent();
3541
3542 return Legalized;
3543 }
3544
/// Lower a G_BITCAST involving vector types by unmerging the source into
/// pieces, bitcasting the pieces where element sizes differ, and reassembling
/// them with a merge-like instruction. Scalar-to-scalar bitcasts are not
/// handled here.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitcast(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  if (SrcTy.isVector()) {
    LLT SrcEltTy = SrcTy.getElementType();
    SmallVector<Register, 8> SrcRegs;

    if (DstTy.isVector()) {
      int NumDstElt = DstTy.getNumElements();
      int NumSrcElt = SrcTy.getNumElements();

      LLT DstEltTy = DstTy.getElementType();
      LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
      LLT SrcPartTy = SrcEltTy; // Original unmerge result type.

      // If there's an element size mismatch, insert intermediate casts to match
      // the result element type.
      if (NumSrcElt < NumDstElt) { // Source element type is larger.
        // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
        //
        // =>
        //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
        DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
        SrcPartTy = SrcEltTy;
      } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
        //
        // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
        //
        // =>
        //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
        SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
        DstCastTy = DstEltTy;
      }

      // Unmerge into SrcPartTy pieces, then bitcast each piece to the
      // destination's intermediate type.
      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
      for (Register &SrcReg : SrcRegs)
        SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
    } else
      // Vector -> scalar: the scalar is simply the merge of the elements.
      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);

    MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  if (DstTy.isVector()) {
    // Scalar -> vector: split the scalar into destination-element sized
    // pieces and build the vector from them.
    SmallVector<Register, 8> SrcRegs;
    getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
    MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
3608
3609 /// Figure out the bit offset into a register when coercing a vector index for
3610 /// the wide element type. This is only for the case when promoting vector to
3611 /// one with larger elements.
3612 //
3613 ///
3614 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3615 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
getBitcastWiderVectorElementOffset(MachineIRBuilder & B,Register Idx,unsigned NewEltSize,unsigned OldEltSize)3616 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3617 Register Idx,
3618 unsigned NewEltSize,
3619 unsigned OldEltSize) {
3620 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3621 LLT IdxTy = B.getMRI()->getType(Idx);
3622
3623 // Now figure out the amount we need to shift to get the target bits.
3624 auto OffsetMask = B.buildConstant(
3625 IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
3626 auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
3627 return B.buildShl(IdxTy, OffsetIdx,
3628 B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
3629 }
3630
/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
/// is casting to a vector with a smaller element size, perform multiple element
/// extracts and merge the results. If this is coercing to a vector with larger
/// elements, index the bitcasted vector and extract the target element with bit
/// operations. This is intended to force the indexing in the native register
/// size for architectures that can dynamically index the register file.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  // Only the source vector operand (type index 1) can be bitcast here.
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();

  LLT SrcEltTy = SrcVecTy.getElementType();
  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = SrcVecTy.getNumElements();

  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);

  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
  if (NewNumElts > OldNumElts) {
    // Decreasing the vector element size
    //
    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
    //  =>
    //  v4i32:castx = bitcast x:v2i64
    //
    // i64 = bitcast
    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
    //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
    //
    // Each old element must map to a whole number of new elements.
    if (NewNumElts % OldNumElts != 0)
      return UnableToLegalize;

    // Type of the intermediate result vector.
    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
    LLT MidTy =
        LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);

    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);

    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
    // Index of the first narrow element that composes the requested wide one.
    auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);

    // Extract each narrow piece of the requested element.
    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
      auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
      auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
      auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
      NewOps[I] = Elt.getReg(0);
    }

    // Reassemble the pieces and bitcast back to the original element type.
    auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
    MIRBuilder.buildBitcast(Dst, NewVec);
    MI.eraseFromParent();
    return Legalized;
  }

  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    // Increasing the vector element size.
    // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
    //
    //   =>
    //
    // %cast = G_BITCAST %vec
    // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
    // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
    // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
    // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
    // %elt_bits = G_LSHR %wide_elt, %offset_bits
    // %elt = G_TRUNC %elt_bits

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    // If the cast collapsed the vector to a single scalar, that scalar is
    // already the wide "element"; otherwise extract it.
    Register WideElt = CastVec;
    if (CastTy.isVector()) {
      WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                     ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
      MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Shift the wide element to get the target element.
    auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
    MIRBuilder.buildTrunc(Dst, ExtractedBits);
    MI.eraseFromParent();
    return Legalized;
  }

  // Same element count: nothing useful to do here.
  return UnableToLegalize;
}
3739
3740 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
3741 /// TargetReg, while preserving other bits in \p TargetReg.
3742 ///
3743 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
buildBitFieldInsert(MachineIRBuilder & B,Register TargetReg,Register InsertReg,Register OffsetBits)3744 static Register buildBitFieldInsert(MachineIRBuilder &B,
3745 Register TargetReg, Register InsertReg,
3746 Register OffsetBits) {
3747 LLT TargetTy = B.getMRI()->getType(TargetReg);
3748 LLT InsertTy = B.getMRI()->getType(InsertReg);
3749 auto ZextVal = B.buildZExt(TargetTy, InsertReg);
3750 auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
3751
3752 // Produce a bitmask of the value to insert
3753 auto EltMask = B.buildConstant(
3754 TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
3755 InsertTy.getSizeInBits()));
3756 // Shift it into position
3757 auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
3758 auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
3759
3760 // Clear out the bits in the wide element
3761 auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
3762
3763 // The value to insert has all zeros already, so stick it into the masked
3764 // wide element.
3765 return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
3766 }
3767
/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
/// is increasing the element size, perform the indexing in the target element
/// type, and use bit operations to insert at the element position. This is
/// intended for architectures that can dynamically index the register file and
/// want to force indexing in the native register size.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  // Only the destination vector type (type index 0) can be bitcast here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
      MI.getFirst4RegLLTs();
  LLT VecTy = DstTy;

  LLT VecEltTy = VecTy.getElementType();
  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = VecEltTy.getSizeInBits();

  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = VecTy.getNumElements();

  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
  // Only the widening-element case (fewer, larger elements) is handled.
  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    // Extract the wide element containing the insertion point. If the cast
    // collapsed the vector to a single scalar, it is that scalar.
    Register ExtractedElt = CastVec;
    if (CastTy.isVector()) {
      ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                          ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
      MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Splice the new value into the wide element with bit operations, then
    // put the wide element back (if we extracted one).
    Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
                                               Val, OffsetBits);
    if (CastTy.isVector()) {
      InsertedElt = MIRBuilder.buildInsertVectorElement(
        CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
    }

    MIRBuilder.buildBitcast(Dst, InsertedElt);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
3832
3833 // This attempts to handle G_CONCAT_VECTORS with illegal operands, particularly
3834 // those that have smaller than legal operands.
3835 //
3836 // <16 x s8> = G_CONCAT_VECTORS <4 x s8>, <4 x s8>, <4 x s8>, <4 x s8>
3837 //
3838 // ===>
3839 //
3840 // s32 = G_BITCAST <4 x s8>
3841 // s32 = G_BITCAST <4 x s8>
3842 // s32 = G_BITCAST <4 x s8>
3843 // s32 = G_BITCAST <4 x s8>
3844 // <4 x s32> = G_BUILD_VECTOR s32, s32, s32, s32
3845 // <16 x s8> = G_BITCAST <4 x s32>
3846 LegalizerHelper::LegalizeResult
bitcastConcatVector(MachineInstr & MI,unsigned TypeIdx,LLT CastTy)3847 LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
3848 LLT CastTy) {
3849 // Convert it to CONCAT instruction
3850 auto ConcatMI = dyn_cast<GConcatVectors>(&MI);
3851 if (!ConcatMI) {
3852 return UnableToLegalize;
3853 }
3854
3855 // Check if bitcast is Legal
3856 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
3857 LLT SrcScalTy = LLT::scalar(SrcTy.getSizeInBits());
3858
3859 // Check if the build vector is Legal
3860 if (!LI.isLegal({TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) {
3861 return UnableToLegalize;
3862 }
3863
3864 // Bitcast the sources
3865 SmallVector<Register> BitcastRegs;
3866 for (unsigned i = 0; i < ConcatMI->getNumSources(); i++) {
3867 BitcastRegs.push_back(
3868 MIRBuilder.buildBitcast(SrcScalTy, ConcatMI->getSourceReg(i))
3869 .getReg(0));
3870 }
3871
3872 // Build the scalar values into a vector
3873 Register BuildReg =
3874 MIRBuilder.buildBuildVector(CastTy, BitcastRegs).getReg(0);
3875 MIRBuilder.buildBitcast(DstReg, BuildReg);
3876
3877 MI.eraseFromParent();
3878 return Legalized;
3879 }
3880
3881 // This bitcasts a shuffle vector to a different type currently of the same
3882 // element size. Mostly used to legalize ptr vectors, where ptrtoint/inttoptr
3883 // will be used instead.
3884 //
3885 // <16 x p0> = G_CONCAT_VECTORS <4 x p0>, <4 x p0>, mask
3886 // ===>
3887 // <4 x s64> = G_PTRTOINT <4 x p0>
3888 // <4 x s64> = G_PTRTOINT <4 x p0>
3889 // <16 x s64> = G_CONCAT_VECTORS <4 x s64>, <4 x s64>, mask
3890 // <16 x p0> = G_INTTOPTR <16 x s64>
3891 LegalizerHelper::LegalizeResult
bitcastShuffleVector(MachineInstr & MI,unsigned TypeIdx,LLT CastTy)3892 LegalizerHelper::bitcastShuffleVector(MachineInstr &MI, unsigned TypeIdx,
3893 LLT CastTy) {
3894 auto ShuffleMI = cast<GShuffleVector>(&MI);
3895 LLT DstTy = MRI.getType(ShuffleMI->getReg(0));
3896 LLT SrcTy = MRI.getType(ShuffleMI->getReg(1));
3897
3898 // We currently only handle vectors of the same size.
3899 if (TypeIdx != 0 ||
3900 CastTy.getScalarSizeInBits() != DstTy.getScalarSizeInBits() ||
3901 CastTy.getElementCount() != DstTy.getElementCount())
3902 return UnableToLegalize;
3903
3904 LLT NewSrcTy = SrcTy.changeElementType(CastTy.getScalarType());
3905
3906 auto Inp1 = MIRBuilder.buildCast(NewSrcTy, ShuffleMI->getReg(1));
3907 auto Inp2 = MIRBuilder.buildCast(NewSrcTy, ShuffleMI->getReg(2));
3908 auto Shuf =
3909 MIRBuilder.buildShuffleVector(CastTy, Inp1, Inp2, ShuffleMI->getMask());
3910 MIRBuilder.buildCast(ShuffleMI->getReg(0), Shuf);
3911
3912 MI.eraseFromParent();
3913 return Legalized;
3914 }
3915
/// This attempts to bitcast G_EXTRACT_SUBVECTOR to CastTy.
///
///  <vscale x 8 x i1> = G_EXTRACT_SUBVECTOR <vscale x 16 x i1>, N
///
///  ===>
///
///  <vscale x 2 x i1> = G_BITCAST <vscale x 16 x i1>
///  <vscale x 1 x i8> = G_EXTRACT_SUBVECTOR <vscale x 2 x i1>, N / 8
///  <vscale x 8 x i1> = G_BITCAST <vscale x 1 x i8>
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  auto ES = cast<GExtractSubvector>(&MI);

  if (!CastTy.isVector())
    return UnableToLegalize;

  // Only the result type (type index 0) can be bitcast here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = ES->getReg(0);
  Register Src = ES->getSrcVec();
  uint64_t Idx = ES->getIndexImm();

  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  ElementCount DstTyEC = DstTy.getElementCount();
  ElementCount SrcTyEC = SrcTy.getElementCount();
  auto DstTyMinElts = DstTyEC.getKnownMinValue();
  auto SrcTyMinElts = SrcTyEC.getKnownMinValue();

  // Nothing to do if the requested cast is already the result type.
  if (DstTy == CastTy)
    return Legalized;

  // Only size-preserving casts are handled.
  if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
    return UnableToLegalize;

  unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
  unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
  if (CastEltSize < DstEltSize)
    return UnableToLegalize;

  // AdjustAmt = how many original elements pack into one cast element. The
  // index and both element counts must stay whole after regrouping.
  auto AdjustAmt = CastEltSize / DstEltSize;
  if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
      SrcTyMinElts % AdjustAmt != 0)
    return UnableToLegalize;

  Idx /= AdjustAmt;
  // NOTE(review): AdjustAmt is also used here as the new element bit width;
  // that equals CastEltSize only when DstEltSize == 1 (the i1 cases shown in
  // the example above) — confirm for wider original element types.
  SrcTy = LLT::vector(SrcTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
  auto CastVec = MIRBuilder.buildBitcast(SrcTy, Src);
  auto PromotedES = MIRBuilder.buildExtractSubvector(CastTy, CastVec, Idx);
  MIRBuilder.buildBitcast(Dst, PromotedES);

  ES->eraseFromParent();
  return Legalized;
}
3974
/// This attempts to bitcast G_INSERT_SUBVECTOR to CastTy.
///
///  <vscale x 16 x i1> = G_INSERT_SUBVECTOR <vscale x 16 x i1>,
///                                          <vscale x 8 x i1>,
///                                          N
///
///  ===>
///
///  <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
///  <vscale x 1 x i8> = G_BITCAST <vscale x 8 x i1>
///  <vscale x 2 x i8> = G_INSERT_SUBVECTOR <vscale x 2 x i8>,
///                                         <vscale x 1 x i8>, N / 8
///  <vscale x 16 x i1> = G_BITCAST <vscale x 2 x i8>
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  auto ES = cast<GInsertSubvector>(&MI);

  if (!CastTy.isVector())
    return UnableToLegalize;

  // Only the result type (type index 0) can be bitcast here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = ES->getReg(0);
  Register BigVec = ES->getBigVec();
  Register SubVec = ES->getSubVec();
  uint64_t Idx = ES->getIndexImm();

  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  LLT DstTy = MRI.getType(Dst);
  LLT BigVecTy = MRI.getType(BigVec);
  LLT SubVecTy = MRI.getType(SubVec);

  // Nothing to do if the requested cast is already the result type.
  if (DstTy == CastTy)
    return Legalized;

  // Only size-preserving casts are handled.
  if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
    return UnableToLegalize;

  ElementCount DstTyEC = DstTy.getElementCount();
  ElementCount BigVecTyEC = BigVecTy.getElementCount();
  ElementCount SubVecTyEC = SubVecTy.getElementCount();
  auto DstTyMinElts = DstTyEC.getKnownMinValue();
  auto BigVecTyMinElts = BigVecTyEC.getKnownMinValue();
  auto SubVecTyMinElts = SubVecTyEC.getKnownMinValue();

  unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
  unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
  if (CastEltSize < DstEltSize)
    return UnableToLegalize;

  // AdjustAmt = how many original elements pack into one cast element. The
  // index and all three element counts must stay whole after regrouping.
  auto AdjustAmt = CastEltSize / DstEltSize;
  if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
      BigVecTyMinElts % AdjustAmt != 0 || SubVecTyMinElts % AdjustAmt != 0)
    return UnableToLegalize;

  Idx /= AdjustAmt;
  // NOTE(review): AdjustAmt is also used here as the new element bit width;
  // that equals CastEltSize only when DstEltSize == 1 (the i1 cases shown in
  // the example above) — confirm for wider original element types.
  BigVecTy = LLT::vector(BigVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
  SubVecTy = LLT::vector(SubVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
  auto CastBigVec = MIRBuilder.buildBitcast(BigVecTy, BigVec);
  auto CastSubVec = MIRBuilder.buildBitcast(SubVecTy, SubVec);
  auto PromotedIS =
      MIRBuilder.buildInsertSubvector(CastTy, CastBigVec, CastSubVec, Idx);
  MIRBuilder.buildBitcast(Dst, PromotedIS);

  ES->eraseFromParent();
  return Legalized;
}
4045
/// Lower a load whose memory type the target cannot handle directly: promote
/// non-byte-sized loads to byte-sized ones, widen over-aligned vector loads,
/// and split the remaining cases into two power-of-2 sized loads combined with
/// shift/or.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
  // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
  Register DstReg = LoadMI.getDstReg();
  Register PtrReg = LoadMI.getPointerReg();
  LLT DstTy = MRI.getType(DstReg);
  MachineMemOperand &MMO = LoadMI.getMMO();
  LLT MemTy = MMO.getMemoryType();
  MachineFunction &MF = MIRBuilder.getMF();

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // Non-byte-sized memory type (e.g. s20): round up to whole bytes first.
  if (MemSizeInBits != MemStoreSizeInBits) {
    if (MemTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized load if not loading an integral number of
    // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
    LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);

    Register LoadReg = DstReg;
    LLT LoadTy = DstTy;

    // If this wasn't already an extending load, we need to widen the result
    // register to avoid creating a load with a narrower result than the source.
    if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
      LoadTy = WideMemTy;
      LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
    }

    if (isa<GSExtLoad>(LoadMI)) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
    } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way.  A zext load from Wide thus automatically gives zext from MemVT.
      MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
    } else {
      MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
    }

    if (DstTy != LoadTy)
      MIRBuilder.buildTrunc(DstReg, LoadReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }

  // Big endian lowering not implemented.
  if (MIRBuilder.getDataLayout().isBigEndian())
    return UnableToLegalize;

  // This load needs splitting into power of 2 sized loads.
  //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to next power-2 result type, and then combine the two larger
  // result values together, before truncating back down to the non-pow-2
  // type.
  // E.g. v1 = i24 load =>
  // v2 = i32 zextload (2 byte)
  // v3 = i32 load (1 byte)
  // v4 = i32 shl v3, 16
  // v5 = i32 or v4, v2
  // v1 = i24 trunc v5
  // By doing this we generate the correct truncate which should get
  // combined away as an artifact with a matching extend.

  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    // This load needs splitting into power of 2 sized loads.
    LargeSplitSize = llvm::bit_floor(MemSizeInBits);
    SmallSplitSize = MemSizeInBits - LargeSplitSize;
  } else {
    // This is already a power of 2, but we still need to split this in half.
    //
    // Assume we're being asked to decompose an unaligned load.
    // TODO: If this requires multiple splits, handle them all at once.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize;

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector extloads
    if (MemTy != DstTy)
      return UnableToLegalize;

    Align Alignment = LoadMI.getAlign();
    // Given an alignment larger than the size of the memory, we can increase
    // the size of the load without needing to scalarize it.
    if (Alignment.value() * 8 > MemSizeInBits &&
        isPowerOf2_64(DstTy.getScalarSizeInBits())) {
      LLT MoreTy = LLT::fixed_vector(NextPowerOf2(DstTy.getNumElements()),
                                     DstTy.getElementType());
      MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, 0, MoreTy);
      auto NewLoad = MIRBuilder.buildLoad(MoreTy, PtrReg, *NewMMO);
      MIRBuilder.buildDeleteTrailingVectorElements(LoadMI.getReg(0),
                                                   NewLoad.getReg(0));
      LoadMI.eraseFromParent();
      return Legalized;
    }

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
  }

  // Memory operands for the low (large) and high (small) parts of the split.
  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);

  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
  LLT AnyExtTy = LLT::scalar(AnyExtSize);
  // Low part must be zero-extended so the OR below doesn't clobber its high
  // bits with garbage.
  auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
                                             PtrReg, *LargeMMO);

  auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
                                            LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
  auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
  auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
                                             SmallPtr, *SmallMMO);

  // Shift the high part into position and combine (little endian layout).
  auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
  auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);

  if (AnyExtTy == DstTy)
    MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
    MIRBuilder.buildTrunc(DstReg, {Or});
  } else {
    assert(DstTy.isPointer() && "expected pointer");
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);

    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we need still need a way to reinterpret the bits.
    MIRBuilder.buildIntToPtr(DstReg, Or);
  }

  LoadMI.eraseFromParent();
  return Legalized;
}
4197
/// Lower a store whose memory type the target cannot handle directly: promote
/// non-byte-sized stores to byte-sized ones, scalarize/split vector stores,
/// and split the remaining cases into two power-of-2 sized truncating stores.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // By doing this, likewise with G_LOAD, generate an extend that can be
  // artifact-combined away instead of leaving behind extracts.
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(SrcReg);
  MachineFunction &MF = MIRBuilder.getMF();
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();

  unsigned StoreWidth = MemTy.getSizeInBits();
  unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // Non-byte-sized scalar memory type (e.g. s1): round up to whole bytes.
  if (StoreWidth != StoreSizeInBits && !SrcTy.isVector()) {
    // Promote to a byte-sized store with upper bits zero if not
    // storing an integral number of bytes.  For example, promote
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
    LLT WideTy = LLT::scalar(StoreSizeInBits);

    if (StoreSizeInBits > SrcTy.getSizeInBits()) {
      // Avoid creating a store with a narrower source than result.
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
      SrcTy = WideTy;
    }

    // Zero the padding bits so the widened store has a defined value.
    auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);

    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
    MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  if (MemTy.isVector()) {
    // Truncating vector store (e.g. predicate vectors) goes element-by-element.
    if (MemTy != SrcTy)
      return scalarizeVectorBooleanStore(StoreMI);

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
  }

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits());
    SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
  } else {
    // Power-of-2 size: only split if the target can't do this access (e.g.
    // it is unaligned); otherwise there is nothing for us to do.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize; // Don't know what we're being asked to do.

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  // Extend to the next pow-2. If this store was itself the result of lowering,
  // e.g. an s56 store being broken into s32 + s24, we might have a stored type
  // that's wider than the stored size.
  unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
  const LLT NewSrcTy = LLT::scalar(AnyExtSize);

  if (SrcTy.isPointer()) {
    // Pointers can't be shifted; work on the integer representation.
    const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
    SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
  }

  auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);

  // Obtain the smaller value by shifting away the larger value.
  auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
  auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);

  // Generate the PtrAdd and truncating stores.
  LLT PtrTy = MRI.getType(PtrReg);
  auto OffsetCst = MIRBuilder.buildConstant(
    LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
  auto SmallPtr =
    MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);

  MachineMemOperand *LargeMMO =
    MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
    MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
  MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
  MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
  StoreMI.eraseFromParent();
  return Legalized;
}
4292
4293 LegalizerHelper::LegalizeResult
scalarizeVectorBooleanStore(GStore & StoreMI)4294 LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
4295 Register SrcReg = StoreMI.getValueReg();
4296 Register PtrReg = StoreMI.getPointerReg();
4297 LLT SrcTy = MRI.getType(SrcReg);
4298 MachineMemOperand &MMO = **StoreMI.memoperands_begin();
4299 LLT MemTy = MMO.getMemoryType();
4300 LLT MemScalarTy = MemTy.getElementType();
4301 MachineFunction &MF = MIRBuilder.getMF();
4302
4303 assert(SrcTy.isVector() && "Expect a vector store type");
4304
4305 if (!MemScalarTy.isByteSized()) {
4306 // We need to build an integer scalar of the vector bit pattern.
4307 // It's not legal for us to add padding when storing a vector.
4308 unsigned NumBits = MemTy.getSizeInBits();
4309 LLT IntTy = LLT::scalar(NumBits);
4310 auto CurrVal = MIRBuilder.buildConstant(IntTy, 0);
4311 LLT IdxTy = TLI.getVectorIdxLLT(MF.getDataLayout());
4312
4313 for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
4314 auto Elt = MIRBuilder.buildExtractVectorElement(
4315 SrcTy.getElementType(), SrcReg, MIRBuilder.buildConstant(IdxTy, I));
4316 auto Trunc = MIRBuilder.buildTrunc(MemScalarTy, Elt);
4317 auto ZExt = MIRBuilder.buildZExt(IntTy, Trunc);
4318 unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
4319 ? (MemTy.getNumElements() - 1) - I
4320 : I;
4321 auto ShiftAmt = MIRBuilder.buildConstant(
4322 IntTy, ShiftIntoIdx * MemScalarTy.getSizeInBits());
4323 auto Shifted = MIRBuilder.buildShl(IntTy, ZExt, ShiftAmt);
4324 CurrVal = MIRBuilder.buildOr(IntTy, CurrVal, Shifted);
4325 }
4326 auto PtrInfo = MMO.getPointerInfo();
4327 auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, IntTy);
4328 MIRBuilder.buildStore(CurrVal, PtrReg, *NewMMO);
4329 StoreMI.eraseFromParent();
4330 return Legalized;
4331 }
4332
4333 // TODO: implement simple scalarization.
4334 return UnableToLegalize;
4335 }
4336
4337 LegalizerHelper::LegalizeResult
bitcast(MachineInstr & MI,unsigned TypeIdx,LLT CastTy)4338 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
4339 switch (MI.getOpcode()) {
4340 case TargetOpcode::G_LOAD: {
4341 if (TypeIdx != 0)
4342 return UnableToLegalize;
4343 MachineMemOperand &MMO = **MI.memoperands_begin();
4344
4345 // Not sure how to interpret a bitcast of an extending load.
4346 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
4347 return UnableToLegalize;
4348
4349 Observer.changingInstr(MI);
4350 bitcastDst(MI, CastTy, 0);
4351 MMO.setType(CastTy);
4352 // The range metadata is no longer valid when reinterpreted as a different
4353 // type.
4354 MMO.clearRanges();
4355 Observer.changedInstr(MI);
4356 return Legalized;
4357 }
4358 case TargetOpcode::G_STORE: {
4359 if (TypeIdx != 0)
4360 return UnableToLegalize;
4361
4362 MachineMemOperand &MMO = **MI.memoperands_begin();
4363
4364 // Not sure how to interpret a bitcast of a truncating store.
4365 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
4366 return UnableToLegalize;
4367
4368 Observer.changingInstr(MI);
4369 bitcastSrc(MI, CastTy, 0);
4370 MMO.setType(CastTy);
4371 Observer.changedInstr(MI);
4372 return Legalized;
4373 }
4374 case TargetOpcode::G_SELECT: {
4375 if (TypeIdx != 0)
4376 return UnableToLegalize;
4377
4378 if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
4379 LLVM_DEBUG(
4380 dbgs() << "bitcast action not implemented for vector select\n");
4381 return UnableToLegalize;
4382 }
4383
4384 Observer.changingInstr(MI);
4385 bitcastSrc(MI, CastTy, 2);
4386 bitcastSrc(MI, CastTy, 3);
4387 bitcastDst(MI, CastTy, 0);
4388 Observer.changedInstr(MI);
4389 return Legalized;
4390 }
4391 case TargetOpcode::G_AND:
4392 case TargetOpcode::G_OR:
4393 case TargetOpcode::G_XOR: {
4394 Observer.changingInstr(MI);
4395 bitcastSrc(MI, CastTy, 1);
4396 bitcastSrc(MI, CastTy, 2);
4397 bitcastDst(MI, CastTy, 0);
4398 Observer.changedInstr(MI);
4399 return Legalized;
4400 }
4401 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4402 return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
4403 case TargetOpcode::G_INSERT_VECTOR_ELT:
4404 return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
4405 case TargetOpcode::G_CONCAT_VECTORS:
4406 return bitcastConcatVector(MI, TypeIdx, CastTy);
4407 case TargetOpcode::G_SHUFFLE_VECTOR:
4408 return bitcastShuffleVector(MI, TypeIdx, CastTy);
4409 case TargetOpcode::G_EXTRACT_SUBVECTOR:
4410 return bitcastExtractSubvector(MI, TypeIdx, CastTy);
4411 case TargetOpcode::G_INSERT_SUBVECTOR:
4412 return bitcastInsertSubvector(MI, TypeIdx, CastTy);
4413 default:
4414 return UnableToLegalize;
4415 }
4416 }
4417
4418 // Legalize an instruction by changing the opcode in place.
changeOpcode(MachineInstr & MI,unsigned NewOpcode)4419 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
4420 Observer.changingInstr(MI);
4421 MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
4422 Observer.changedInstr(MI);
4423 }
4424
/// Expand \p MI into an equivalent sequence of simpler generic instructions
/// that are expected to be legal (or further legalizable) for the target.
/// Returns UnableToLegalize when no expansion is implemented for the opcode.
/// \p TypeIdx and \p LowerHintTy are not referenced by the cases below.
LegalizerHelper::LegalizeResult
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
  using namespace TargetOpcode;

  switch(MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_FCONSTANT:
    return lowerFConstant(MI);
  case TargetOpcode::G_BITCAST:
    return lowerBitcast(MI);
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM: {
    // Expand remainder as x - (x / y) * y.
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto Quot =
        MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
                              {MI.getOperand(1), MI.getOperand(2)});

    auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
    MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
    return lowerSADDO_SSUBO(MI);
  case TargetOpcode::G_UMULH:
  case TargetOpcode::G_SMULH:
    return lowerSMULH_UMULH(MI);
  case TargetOpcode::G_SMULO:
  case TargetOpcode::G_UMULO: {
    // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
    // result.
    auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
    LLT Ty = MRI.getType(Res);

    unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
                          ? TargetOpcode::G_SMULH
                          : TargetOpcode::G_UMULH;

    // Reuse MI itself as the G_MUL computing the low half of the product;
    // the overflow def (operand 1) is dropped.
    Observer.changingInstr(MI);
    const auto &TII = MIRBuilder.getTII();
    MI.setDesc(TII.get(TargetOpcode::G_MUL));
    MI.removeOperand(1);
    Observer.changedInstr(MI);

    auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
    auto Zero = MIRBuilder.buildConstant(Ty, 0);

    // Move insert point forward so we can use the Res register if needed.
    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    // For *signed* multiply, overflow is detected by checking:
    // (hi != (lo >> bitwidth-1))
    if (Opcode == TargetOpcode::G_SMULH) {
      auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
      auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
      MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
    } else {
      // Unsigned multiply overflowed iff the high half is nonzero.
      MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
    }
    return Legalized;
  }
  case TargetOpcode::G_FNEG: {
    auto [Res, SubByReg] = MI.getFirst2Regs();
    LLT Ty = MRI.getType(Res);

    // Lower fneg to xor with the sign-bit mask.
    auto SignMask = MIRBuilder.buildConstant(
        Ty, APInt::getSignMask(Ty.getScalarSizeInBits()));
    MIRBuilder.buildXor(Res, SubByReg, SignMask);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_STRICT_FSUB: {
    auto [Res, LHS, RHS] = MI.getFirst3Regs();
    LLT Ty = MRI.getType(Res);

    // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
    auto Neg = MIRBuilder.buildFNeg(Ty, RHS);

    if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
      MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags());
    else
      MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FMAD:
    return lowerFMad(MI);
  case TargetOpcode::G_FFLOOR:
    return lowerFFloor(MI);
  case TargetOpcode::G_LROUND:
  case TargetOpcode::G_LLROUND: {
    // Round in the FP domain, then convert to the integer result.
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    auto Round = MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_ROUND, {SrcTy},
                                       {SrcReg});
    MIRBuilder.buildFPTOSI(DstReg, Round);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_INTRINSIC_ROUND:
    return lowerIntrinsicRound(MI);
  case TargetOpcode::G_FRINT: {
    // Since round even is the assumed rounding mode for unconstrained FP
    // operations, rint and roundeven are the same operation.
    changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN);
    return Legalized;
  }
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT: {
    // rint in the FP domain, then convert to the integer result.
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    auto Round =
        MIRBuilder.buildInstr(TargetOpcode::G_FRINT, {SrcTy}, {SrcReg});
    MIRBuilder.buildFPTOSI(DstReg, Round);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    // Lower to a plain cmpxchg plus an explicit compare of the loaded value
    // against the expected value to produce the success flag.
    auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
    Register NewOldValRes = MRI.cloneVirtualRegister(OldValRes);
    MIRBuilder.buildAtomicCmpXchg(NewOldValRes, Addr, CmpVal, NewVal,
                                  **MI.memoperands_begin());
    MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, NewOldValRes, CmpVal);
    MIRBuilder.buildCopy(OldValRes, NewOldValRes);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return lowerLoad(cast<GAnyLoad>(MI));
  case TargetOpcode::G_STORE:
    return lowerStore(cast<GStore>(MI));
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTPOP:
    return lowerBitCount(MI);
  case G_UADDO: {
    // Unsigned add overflowed iff the result wrapped below an operand.
    auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();

    Register NewRes = MRI.cloneVirtualRegister(Res);

    MIRBuilder.buildAdd(NewRes, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, NewRes, RHS);

    MIRBuilder.buildCopy(Res, NewRes);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UADDE: {
    auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(CarryOut);
    const LLT Ty = MRI.getType(Res);

    Register NewRes = MRI.cloneVirtualRegister(Res);

    // Initial add of the two operands.
    auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);

    // Initial check for carry.
    auto Carry = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);

    // Add the sum and the carry.
    auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
    MIRBuilder.buildAdd(NewRes, TmpRes, ZExtCarryIn);

    // Second check for carry. We can only carry if the initial sum is all 1s
    // and the carry is set, resulting in a new sum of 0.
    auto Zero = MIRBuilder.buildConstant(Ty, 0);
    auto ResEqZero =
        MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, NewRes, Zero);
    auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn);
    MIRBuilder.buildOr(CarryOut, Carry, Carry2);

    MIRBuilder.buildCopy(Res, NewRes);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBO: {
    // Unsigned subtract borrowed iff RHS exceeds LHS.
    auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();

    MIRBuilder.buildSub(Res, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_USUBE: {
    auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(BorrowOut);
    const LLT Ty = MRI.getType(Res);

    // Initial subtract of the two operands.
    auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);

    // Initial check for borrow.
    auto Borrow = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, CondTy, TmpRes, LHS);

    // Subtract the borrow from the first subtract.
    auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
    MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);

    // Second check for borrow. We can only borrow if the initial difference is
    // 0 and the borrow is set, resulting in a new difference of all 1s.
    auto Zero = MIRBuilder.buildConstant(Ty, 0);
    auto TmpResEqZero =
        MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, TmpRes, Zero);
    auto Borrow2 = MIRBuilder.buildAnd(CondTy, TmpResEqZero, BorrowIn);
    MIRBuilder.buildOr(BorrowOut, Borrow, Borrow2);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UITOFP:
    return lowerUITOFP(MI);
  case G_SITOFP:
    return lowerSITOFP(MI);
  case G_FPTOUI:
    return lowerFPTOUI(MI);
  case G_FPTOSI:
    return lowerFPTOSI(MI);
  case G_FPTOUI_SAT:
  case G_FPTOSI_SAT:
    return lowerFPTOINT_SAT(MI);
  case G_FPTRUNC:
    return lowerFPTRUNC(MI);
  case G_FPOWI:
    return lowerFPOWI(MI);
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
    return lowerMinMax(MI);
  case G_SCMP:
  case G_UCMP:
    return lowerThreewayCompare(MI);
  case G_FCOPYSIGN:
    return lowerFCopySign(MI);
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINIMUMNUM:
  case G_FMAXIMUMNUM:
    return lowerFMinNumMaxNum(MI);
  case G_MERGE_VALUES:
    return lowerMergeValues(MI);
  case G_UNMERGE_VALUES:
    return lowerUnmergeValues(MI);
  case TargetOpcode::G_SEXT_INREG: {
    assert(MI.getOperand(2).isImm() && "Expected immediate");
    int64_t SizeInBits = MI.getOperand(2).getImm();

    auto [DstReg, SrcReg] = MI.getFirst2Regs();
    LLT DstTy = MRI.getType(DstReg);
    Register TmpRes = MRI.createGenericVirtualRegister(DstTy);

    // Sign-extend in register via a shl/ashr pair by (width - SizeInBits).
    auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
    MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
    MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
    MI.eraseFromParent();
    return Legalized;
  }
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return lowerExtractInsertVectorElt(MI);
  case G_SHUFFLE_VECTOR:
    return lowerShuffleVector(MI);
  case G_VECTOR_COMPRESS:
    return lowerVECTOR_COMPRESS(MI);
  case G_DYN_STACKALLOC:
    return lowerDynStackAlloc(MI);
  case G_STACKSAVE:
    return lowerStackSave(MI);
  case G_STACKRESTORE:
    return lowerStackRestore(MI);
  case G_EXTRACT:
    return lowerExtract(MI);
  case G_INSERT:
    return lowerInsert(MI);
  case G_BSWAP:
    return lowerBswap(MI);
  case G_BITREVERSE:
    return lowerBitreverse(MI);
  case G_READ_REGISTER:
  case G_WRITE_REGISTER:
    return lowerReadWriteRegister(MI);
  case G_UADDSAT:
  case G_USUBSAT: {
    // Try to make a reasonable guess about which lowering strategy to use. The
    // target can override this with custom lowering and calling the
    // implementation functions.
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    if (LI.isLegalOrCustom({G_UMIN, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SADDSAT:
  case G_SSUBSAT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());

    // FIXME: It would probably make more sense to see if G_SADDO is preferred,
    // since it's a shorter expansion. However, we would need to figure out the
    // preferred boolean type for the carry out for the query.
    if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SSHLSAT:
  case G_USHLSAT:
    return lowerShlSat(MI);
  case G_ABS:
    return lowerAbsToAddXor(MI);
  case G_FABS:
    return lowerFAbs(MI);
  case G_SELECT:
    return lowerSelect(MI);
  case G_IS_FPCLASS:
    return lowerISFPCLASS(MI);
  case G_SDIVREM:
  case G_UDIVREM:
    return lowerDIVREM(MI);
  case G_FSHL:
  case G_FSHR:
    return lowerFunnelShift(MI);
  case G_ROTL:
  case G_ROTR:
    return lowerRotate(MI);
  case G_MEMSET:
  case G_MEMCPY:
  case G_MEMMOVE:
    return lowerMemCpyFamily(MI);
  case G_MEMCPY_INLINE:
    return lowerMemcpyInline(MI);
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
    return lowerEXT(MI);
  case G_TRUNC:
    return lowerTRUNC(MI);
  GISEL_VECREDUCE_CASES_NONSEQ
    return lowerVectorReduction(MI);
  case G_VAARG:
    return lowerVAArg(MI);
  }
}
4779
getStackTemporaryAlignment(LLT Ty,Align MinAlign) const4780 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
4781 Align MinAlign) const {
4782 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
4783 // datalayout for the preferred alignment. Also there should be a target hook
4784 // for this to allow targets to reduce the alignment and ignore the
4785 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
4786 // the type.
4787 return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
4788 }
4789
4790 MachineInstrBuilder
createStackTemporary(TypeSize Bytes,Align Alignment,MachinePointerInfo & PtrInfo)4791 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
4792 MachinePointerInfo &PtrInfo) {
4793 MachineFunction &MF = MIRBuilder.getMF();
4794 const DataLayout &DL = MIRBuilder.getDataLayout();
4795 int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
4796
4797 unsigned AddrSpace = DL.getAllocaAddrSpace();
4798 LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
4799
4800 PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
4801 return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
4802 }
4803
createStackStoreLoad(const DstOp & Res,const SrcOp & Val)4804 MachineInstrBuilder LegalizerHelper::createStackStoreLoad(const DstOp &Res,
4805 const SrcOp &Val) {
4806 LLT SrcTy = Val.getLLTTy(MRI);
4807 Align StackTypeAlign =
4808 std::max(getStackTemporaryAlignment(SrcTy),
4809 getStackTemporaryAlignment(Res.getLLTTy(MRI)));
4810 MachinePointerInfo PtrInfo;
4811 auto StackTemp =
4812 createStackTemporary(SrcTy.getSizeInBytes(), StackTypeAlign, PtrInfo);
4813
4814 MIRBuilder.buildStore(Val, StackTemp, PtrInfo, StackTypeAlign);
4815 return MIRBuilder.buildLoad(Res, StackTemp, PtrInfo, StackTypeAlign);
4816 }
4817
clampVectorIndex(MachineIRBuilder & B,Register IdxReg,LLT VecTy)4818 static Register clampVectorIndex(MachineIRBuilder &B, Register IdxReg,
4819 LLT VecTy) {
4820 LLT IdxTy = B.getMRI()->getType(IdxReg);
4821 unsigned NElts = VecTy.getNumElements();
4822
4823 int64_t IdxVal;
4824 if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal))) {
4825 if (IdxVal < VecTy.getNumElements())
4826 return IdxReg;
4827 // If a constant index would be out of bounds, clamp it as well.
4828 }
4829
4830 if (isPowerOf2_32(NElts)) {
4831 APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
4832 return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
4833 }
4834
4835 return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
4836 .getReg(0);
4837 }
4838
getVectorElementPointer(Register VecPtr,LLT VecTy,Register Index)4839 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
4840 Register Index) {
4841 LLT EltTy = VecTy.getElementType();
4842
4843 // Calculate the element offset and add it to the pointer.
4844 unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
4845 assert(EltSize * 8 == EltTy.getSizeInBits() &&
4846 "Converting bits to bytes lost precision");
4847
4848 Index = clampVectorIndex(MIRBuilder, Index, VecTy);
4849
4850 // Convert index to the correct size for the address space.
4851 const DataLayout &DL = MIRBuilder.getDataLayout();
4852 unsigned AS = MRI.getType(VecPtr).getAddressSpace();
4853 unsigned IndexSizeInBits = DL.getIndexSize(AS) * 8;
4854 LLT IdxTy = MRI.getType(Index).changeElementSize(IndexSizeInBits);
4855 if (IdxTy != MRI.getType(Index))
4856 Index = MIRBuilder.buildSExtOrTrunc(IdxTy, Index).getReg(0);
4857
4858 auto Mul = MIRBuilder.buildMul(IdxTy, Index,
4859 MIRBuilder.buildConstant(IdxTy, EltSize));
4860
4861 LLT PtrTy = MRI.getType(VecPtr);
4862 return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
4863 }
4864
4865 #ifndef NDEBUG
4866 /// Check that all vector operands have same number of elements. Other operands
4867 /// should be listed in NonVecOp.
hasSameNumEltsOnAllVectorOperands(GenericMachineInstr & MI,MachineRegisterInfo & MRI,std::initializer_list<unsigned> NonVecOpIndices)4868 static bool hasSameNumEltsOnAllVectorOperands(
4869 GenericMachineInstr &MI, MachineRegisterInfo &MRI,
4870 std::initializer_list<unsigned> NonVecOpIndices) {
4871 if (MI.getNumMemOperands() != 0)
4872 return false;
4873
4874 LLT VecTy = MRI.getType(MI.getReg(0));
4875 if (!VecTy.isVector())
4876 return false;
4877 unsigned NumElts = VecTy.getNumElements();
4878
4879 for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
4880 MachineOperand &Op = MI.getOperand(OpIdx);
4881 if (!Op.isReg()) {
4882 if (!is_contained(NonVecOpIndices, OpIdx))
4883 return false;
4884 continue;
4885 }
4886
4887 LLT Ty = MRI.getType(Op.getReg());
4888 if (!Ty.isVector()) {
4889 if (!is_contained(NonVecOpIndices, OpIdx))
4890 return false;
4891 continue;
4892 }
4893
4894 if (Ty.getNumElements() != NumElts)
4895 return false;
4896 }
4897
4898 return true;
4899 }
4900 #endif
4901
4902 /// Fill \p DstOps with DstOps that have same number of elements combined as
4903 /// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
4904 /// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple
4905 /// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements.
makeDstOps(SmallVectorImpl<DstOp> & DstOps,LLT Ty,unsigned NumElts)4906 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
4907 unsigned NumElts) {
4908 LLT LeftoverTy;
4909 assert(Ty.isVector() && "Expected vector type");
4910 LLT EltTy = Ty.getElementType();
4911 LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
4912 int NumParts, NumLeftover;
4913 std::tie(NumParts, NumLeftover) =
4914 getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
4915
4916 assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
4917 for (int i = 0; i < NumParts; ++i) {
4918 DstOps.push_back(NarrowTy);
4919 }
4920
4921 if (LeftoverTy.isValid()) {
4922 assert(NumLeftover == 1 && "expected exactly one leftover");
4923 DstOps.push_back(LeftoverTy);
4924 }
4925 }
4926
4927 /// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
4928 /// made from \p Op depending on operand type.
broadcastSrcOp(SmallVectorImpl<SrcOp> & Ops,unsigned N,MachineOperand & Op)4929 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
4930 MachineOperand &Op) {
4931 for (unsigned i = 0; i < N; ++i) {
4932 if (Op.isReg())
4933 Ops.push_back(Op.getReg());
4934 else if (Op.isImm())
4935 Ops.push_back(Op.getImm());
4936 else if (Op.isPredicate())
4937 Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
4938 else
4939 llvm_unreachable("Unsupported type");
4940 }
4941 }
4942
// Handle splitting vector operations which need to have the same number of
// elements in each type index, but each type index may have a different element
// type.
//
// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
//        <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//        <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//
// Also handles some irregular breakdown cases, e.g.
// e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
//        <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//             s64 = G_SHL s64, s32
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMultiEltType(
    GenericMachineInstr &MI, unsigned NumElts,
    std::initializer_list<unsigned> NonVecOpIndices) {
  assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
         "Non-compatible opcode or not specified non-vector operands");
  unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
  // Build instructions with DstOps to use instruction found by CSE directly.
  // CSE copies found instruction into given vreg when building with vreg dest.
  SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
  // Output registers will be taken from created instructions.
  SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
  for (unsigned i = 0; i < NumDefs; ++i) {
    makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
  }

  // Split vector input operands into sub-vectors with NumElts elts + Leftover.
  // Operands listed in NonVecOpIndices will be used as is without splitting;
  // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
  // scalar condition (op 1), immediate in sext_inreg (op 2).
  SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       ++UseIdx, ++UseNo) {
    if (is_contained(NonVecOpIndices, UseIdx)) {
      // Non-vector operands are replicated once per sub-instruction.
      broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
                     MI.getOperand(UseIdx));
    } else {
      SmallVector<Register, 8> SplitPieces;
      extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces, MIRBuilder,
                         MRI);
      llvm::append_range(InputOpsPieces[UseNo], SplitPieces);
    }
  }

  // One extra (narrower) sub-instruction exists when NumElts does not evenly
  // divide the original element count.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;

  // Take i-th piece of each input operand split and build sub-vector/scalar
  // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    SmallVector<DstOp, 2> Defs;
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      Defs.push_back(OutputOpsPieces[DstNo][i]);

    SmallVector<SrcOp, 3> Uses;
    for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
      Uses.push_back(InputOpsPieces[InputNo][i]);

    auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      OutputRegs[DstNo].push_back(I.getReg(DstNo));
  }

  // Merge small outputs into MI's output for each def operand.
  if (NumLeftovers) {
    for (unsigned i = 0; i < NumDefs; ++i)
      mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
  } else {
    for (unsigned i = 0; i < NumDefs; ++i)
      MIRBuilder.buildMergeLikeInstr(MI.getReg(i), OutputRegs[i]);
  }

  MI.eraseFromParent();
  return Legalized;
}
5024
// Split a vector G_PHI into several narrower G_PHIs of NumElts elements each
// (plus one smaller leftover PHI when NumElts does not divide the original
// element count). Incoming values are split in their defining blocks, and the
// narrow PHI results are re-merged after this block's PHI section.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
                                        unsigned NumElts) {
  unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  SmallVector<DstOp, 8> OutputOpsPieces;
  SmallVector<Register, 8> OutputRegs;
  makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);

  // Instructions that perform register split will be inserted in basic block
  // where register is defined (basic block is in the next operand).
  SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       UseIdx += 2, ++UseNo) {
    MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
    extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo],
                       MIRBuilder, MRI);
  }

  // Build PHIs with fewer elements.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
  MIRBuilder.setInsertPt(*MI.getParent(), MI);
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
    Phi.addDef(
        MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
    OutputRegs.push_back(Phi.getReg(0));

    // Each narrow PHI takes the i-th piece of every incoming value, paired
    // with the original predecessor-block operand.
    for (unsigned j = 0; j < NumInputs / 2; ++j) {
      Phi.addUse(InputOpsPieces[j][i]);
      Phi.add(MI.getOperand(1 + j * 2 + 1));
    }
  }

  // Set the insert point after the existing PHIs
  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());

  // Merge small outputs into MI's def.
  if (NumLeftovers) {
    mergeMixedSubvectors(MI.getReg(0), OutputRegs);
  } else {
    MIRBuilder.buildMergeLikeInstr(MI.getReg(0), OutputRegs);
  }

  MI.eraseFromParent();
  return Legalized;
}
5077
5078 LegalizerHelper::LegalizeResult
fewerElementsVectorUnmergeValues(MachineInstr & MI,unsigned TypeIdx,LLT NarrowTy)5079 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
5080 unsigned TypeIdx,
5081 LLT NarrowTy) {
5082 const int NumDst = MI.getNumOperands() - 1;
5083 const Register SrcReg = MI.getOperand(NumDst).getReg();
5084 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
5085 LLT SrcTy = MRI.getType(SrcReg);
5086
5087 if (TypeIdx != 1 || NarrowTy == DstTy)
5088 return UnableToLegalize;
5089
5090 // Requires compatible types. Otherwise SrcReg should have been defined by
5091 // merge-like instruction that would get artifact combined. Most likely
5092 // instruction that defines SrcReg has to perform more/fewer elements
5093 // legalization compatible with NarrowTy.
5094 assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
5095 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5096
5097 if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
5098 (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
5099 return UnableToLegalize;
5100
5101 // This is most likely DstTy (smaller then register size) packed in SrcTy
5102 // (larger then register size) and since unmerge was not combined it will be
5103 // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
5104 // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.
5105
5106 // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
5107 //
5108 // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
5109 // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
5110 // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
5111 auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
5112 const int NumUnmerge = Unmerge->getNumOperands() - 1;
5113 const int PartsPerUnmerge = NumDst / NumUnmerge;
5114
5115 for (int I = 0; I != NumUnmerge; ++I) {
5116 auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
5117
5118 for (int J = 0; J != PartsPerUnmerge; ++J)
5119 MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
5120 MIB.addUse(Unmerge.getReg(I));
5121 }
5122
5123 MI.eraseFromParent();
5124 return Legalized;
5125 }
5126
/// Split a merge-like instruction (G_MERGE_VALUES / G_BUILD_VECTOR /
/// G_CONCAT_VECTORS) producing vector DstTy into merges of NarrowTy pieces
/// that are then re-merged into DstReg.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
                                          LLT NarrowTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  // Requires compatible types. Otherwise user of DstReg did not perform unmerge
  // that should have been artifact combined. Most likely instruction that uses
  // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
  assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
  if (NarrowTy == SrcTy)
    return UnableToLegalize;

  // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
  // is for old mir tests. Since the changes to more/fewer elements it should no
  // longer be possible to generate MIR like this when starting from llvm-ir
  // because LCMTy approach was replaced with merge/unmerge to vector elements.
  if (TypeIdx == 1) {
    assert(SrcTy.isVector() && "Expected vector types");
    assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
    if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
        (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
      return UnableToLegalize;
    // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
    //
    // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
    // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
    // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
    // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
    // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
    // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11

    // Scatter every source vector operand into its individual scalar
    // elements.
    SmallVector<Register, 8> Elts;
    LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
    for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
      auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
      for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
        Elts.push_back(Unmerge.getReg(j));
    }

    // Regroup consecutive scalars into NarrowTy-sized vectors, then merge
    // those into the final destination.
    SmallVector<Register, 8> NarrowTyElts;
    unsigned NumNarrowTyElts = NarrowTy.getNumElements();
    unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
    for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
         ++i, Offset += NumNarrowTyElts) {
      ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
      NarrowTyElts.push_back(
          MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
    }

    MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
    MI.eraseFromParent();
    return Legalized;
  }

  assert(TypeIdx == 0 && "Bad type index");
  if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
      (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely SrcTy (smaller than register size) packed in DstTy
  // (larger than register size) and since merge was not combined it will be
  // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
  // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.

  // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
  //
  // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
  // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
  // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
  SmallVector<Register, 8> NarrowTyElts;
  unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
  // NumElts source operands feed each NarrowTy piece (a scalar source counts
  // as one element).
  unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
  unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
  for (unsigned i = 0; i < NumParts; ++i) {
    SmallVector<Register, 8> Sources;
    for (unsigned j = 0; j < NumElts; ++j)
      Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
    NarrowTyElts.push_back(
        MIRBuilder.buildMergeLikeInstr(NarrowTy, Sources).getReg(0));
  }

  MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
  MI.eraseFromParent();
  return Legalized;
}
5212
/// Narrow G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT by splitting the vector
/// operand into NarrowVecTy pieces and operating on the piece a constant
/// index selects; variable indices fall back to full expansion.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
                                                           unsigned TypeIdx,
                                                           LLT NarrowVecTy) {
  auto [DstReg, SrcVec] = MI.getFirst2Regs();
  Register InsertVal;
  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;

  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
  if (IsInsert)
    InsertVal = MI.getOperand(2).getReg();

  // The index is the last operand for both the insert and extract forms.
  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  // TODO: Handle total scalarization case.
  if (!NarrowVecTy.isVector())
    return UnableToLegalize;

  LLT VecTy = MRI.getType(SrcVec);

  // If the index is a constant, we can really break this down as you would
  // expect, and index into the target size pieces.
  int64_t IdxVal;
  auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
  if (MaybeCst) {
    IdxVal = MaybeCst->Value.getSExtValue();
    // Avoid out of bounds indexing the pieces.
    // NOTE(review): a negative constant index is not caught by this check and
    // would index PartIdx below out of range — presumably such indices are
    // poison/unreachable by this point; confirm with callers.
    if (IdxVal >= VecTy.getNumElements()) {
      MIRBuilder.buildUndef(DstReg);
      MI.eraseFromParent();
      return Legalized;
    }

    SmallVector<Register, 8> VecParts;
    LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);

    // Build a sequence of NarrowTy pieces in VecParts for this operand.
    LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
                                    TargetOpcode::G_ANYEXT);

    unsigned NewNumElts = NarrowVecTy.getNumElements();

    // Re-express the index as (piece number, offset within that piece).
    LLT IdxTy = MRI.getType(Idx);
    int64_t PartIdx = IdxVal / NewNumElts;
    auto NewIdx =
        MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);

    if (IsInsert) {
      LLT PartTy = MRI.getType(VecParts[PartIdx]);

      // Use the adjusted index to insert into one of the subvectors.
      auto InsertPart = MIRBuilder.buildInsertVectorElement(
          PartTy, VecParts[PartIdx], InsertVal, NewIdx);
      VecParts[PartIdx] = InsertPart.getReg(0);

      // Recombine the inserted subvector with the others to reform the result
      // vector.
      buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
    } else {
      MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // With a variable index, we can't perform the operation in a smaller type, so
  // we're forced to expand this.
  //
  // TODO: We could emit a chain of compare/select to figure out which piece to
  // index.
  return lowerExtractInsertVectorElt(MI);
}
5286
/// Narrow a wide (non-atomic, non-extending) load or store by splitting it
/// into a sequence of NarrowTy-sized memory accesses at increasing byte
/// offsets, plus leftover-typed accesses for any remainder.
LegalizerHelper::LegalizeResult
LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
                                      LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  // Byte offsets are computed below with Offset / 8, so the piece type must
  // be byte sized.
  if (!NarrowTy.isByteSized()) {
    LLVM_DEBUG(dbgs() << "Can't narrow load/store to non-byte-sized type\n");
    return UnableToLegalize;
  }

  // This implementation doesn't work for atomics. Give up instead of doing
  // something invalid.
  if (LdStMI.isAtomic())
    return UnableToLegalize;

  bool IsLoad = isa<GLoad>(LdStMI);
  Register ValReg = LdStMI.getReg(0);
  Register AddrReg = LdStMI.getPointerReg();
  LLT ValTy = MRI.getType(ValReg);

  // FIXME: Do we need a distinct NarrowMemory legalize action?
  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize().getValue()) {
    LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
    return UnableToLegalize;
  }

  int NumParts = -1;
  int NumLeftover = -1;
  LLT LeftoverTy;
  SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
  if (IsLoad) {
    // For a load we only need the piece counts; the registers are created as
    // the loads are emitted below.
    std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
  } else {
    // For a store, split the stored value into pieces up front; they are
    // stored individually below.
    if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
                     NarrowLeftoverRegs, MIRBuilder, MRI)) {
      NumParts = NarrowRegs.size();
      NumLeftover = NarrowLeftoverRegs.size();
    }
  }

  // NumParts stays -1 if the breakdown/extraction failed.
  if (NumParts == -1)
    return UnableToLegalize;

  LLT PtrTy = MRI.getType(AddrReg);
  const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());

  unsigned TotalSize = ValTy.getSizeInBits();

  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each element
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
  bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
  auto MMO = LdStMI.getMMO();
  auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
                             unsigned NumParts, unsigned Offset) -> unsigned {
    MachineFunction &MF = MIRBuilder.getMF();
    unsigned PartSize = PartTy.getSizeInBits();
    for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
         ++Idx) {
      unsigned ByteOffset = Offset / 8;
      Register NewAddrReg;

      MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);

      MachineMemOperand *NewMMO =
          MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);

      if (IsLoad) {
        Register Dst = MRI.createGenericVirtualRegister(PartTy);
        ValRegs.push_back(Dst);
        MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
      } else {
        MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
      }
      // On big-endian targets the first (most significant) piece lives at the
      // lowest address, so the bit offset walks downwards.
      Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
    }

    return Offset;
  };

  unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
  unsigned HandledOffset =
      splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);

  // Handle the rest of the register if this isn't an even type breakdown.
  if (LeftoverTy.isValid())
    splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);

  if (IsLoad) {
    // Reassemble the loaded pieces into the original wide value.
    insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
                LeftoverTy, NarrowLeftoverRegs);
  }

  LdStMI.eraseFromParent();
  return Legalized;
}
5386
/// Top-level dispatch for the fewer-elements (vector splitting) legalize
/// action: route each generic opcode to the splitting strategy that knows how
/// to break it into NarrowTy-sized pieces.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  using namespace TargetOpcode;
  GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
  // Elements per split piece; 1 requests full scalarization.
  unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;

  switch (MI.getOpcode()) {
  case G_IMPLICIT_DEF:
  case G_TRUNC:
  case G_AND:
  case G_OR:
  case G_XOR:
  case G_ADD:
  case G_SUB:
  case G_MUL:
  case G_PTR_ADD:
  case G_SMULH:
  case G_UMULH:
  case G_FADD:
  case G_FMUL:
  case G_FSUB:
  case G_FNEG:
  case G_FABS:
  case G_FCANONICALIZE:
  case G_FDIV:
  case G_FREM:
  case G_FMA:
  case G_FMAD:
  case G_FPOW:
  case G_FEXP:
  case G_FEXP2:
  case G_FEXP10:
  case G_FLOG:
  case G_FLOG2:
  case G_FLOG10:
  case G_FLDEXP:
  case G_FNEARBYINT:
  case G_FCEIL:
  case G_FFLOOR:
  case G_FRINT:
  case G_INTRINSIC_LRINT:
  case G_INTRINSIC_LLRINT:
  case G_INTRINSIC_ROUND:
  case G_INTRINSIC_ROUNDEVEN:
  case G_LROUND:
  case G_LLROUND:
  case G_INTRINSIC_TRUNC:
  case G_FCOS:
  case G_FSIN:
  case G_FTAN:
  case G_FACOS:
  case G_FASIN:
  case G_FATAN:
  case G_FATAN2:
  case G_FCOSH:
  case G_FSINH:
  case G_FTANH:
  case G_FSQRT:
  case G_BSWAP:
  case G_BITREVERSE:
  case G_SDIV:
  case G_UDIV:
  case G_SREM:
  case G_UREM:
  case G_SDIVREM:
  case G_UDIVREM:
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
  case G_ABS:
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINNUM_IEEE:
  case G_FMAXNUM_IEEE:
  case G_FMINIMUM:
  case G_FMAXIMUM:
  case G_FMINIMUMNUM:
  case G_FMAXIMUMNUM:
  case G_FSHL:
  case G_FSHR:
  case G_ROTL:
  case G_ROTR:
  case G_FREEZE:
  case G_SADDSAT:
  case G_SSUBSAT:
  case G_UADDSAT:
  case G_USUBSAT:
  case G_UMULO:
  case G_SMULO:
  case G_SHL:
  case G_LSHR:
  case G_ASHR:
  case G_SSHLSAT:
  case G_USHLSAT:
  case G_CTLZ:
  case G_CTLZ_ZERO_UNDEF:
  case G_CTTZ:
  case G_CTTZ_ZERO_UNDEF:
  case G_CTPOP:
  case G_FCOPYSIGN:
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
  case G_FPEXT:
  case G_FPTRUNC:
  case G_SITOFP:
  case G_UITOFP:
  case G_FPTOSI:
  case G_FPTOUI:
  case G_FPTOSI_SAT:
  case G_FPTOUI_SAT:
  case G_INTTOPTR:
  case G_PTRTOINT:
  case G_ADDRSPACE_CAST:
  case G_UADDO:
  case G_USUBO:
  case G_UADDE:
  case G_USUBE:
  case G_SADDO:
  case G_SSUBO:
  case G_SADDE:
  case G_SSUBE:
  case G_STRICT_FADD:
  case G_STRICT_FSUB:
  case G_STRICT_FMUL:
  case G_STRICT_FMA:
  case G_STRICT_FLDEXP:
  case G_FFREXP:
    // Element-wise operations: split every vector operand/result uniformly.
    return fewerElementsVectorMultiEltType(GMI, NumElts);
  case G_ICMP:
  case G_FCMP:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cmp predicate*/});
  case G_IS_FPCLASS:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2, 3 /*mask,fpsem*/});
  case G_SELECT:
    // The condition may be a vector (element-wise select) or a single scalar
    // shared by all the pieces.
    if (MRI.getType(MI.getOperand(1).getReg()).isVector())
      return fewerElementsVectorMultiEltType(GMI, NumElts);
    return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
  case G_PHI:
    return fewerElementsVectorPhi(GMI, NumElts);
  case G_UNMERGE_VALUES:
    return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
  case G_BUILD_VECTOR:
    assert(TypeIdx == 0 && "not a vector type index");
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably does work as expected already.
      return UnableToLegalize;
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
  case G_LOAD:
  case G_STORE:
    return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
  case G_SEXT_INREG:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
  GISEL_VECREDUCE_CASES_NONSEQ
    return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_VECREDUCE_SEQ_FADD:
  case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
    return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
  case G_SHUFFLE_VECTOR:
    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
  case G_FPOWI:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
  case G_BITCAST:
    return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
  case G_INTRINSIC_FPTRUNC_ROUND:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2});
  default:
    return UnableToLegalize;
  }
}
5563
5564 LegalizerHelper::LegalizeResult
fewerElementsBitcast(MachineInstr & MI,unsigned int TypeIdx,LLT NarrowTy)5565 LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx,
5566 LLT NarrowTy) {
5567 assert(MI.getOpcode() == TargetOpcode::G_BITCAST &&
5568 "Not a bitcast operation");
5569
5570 if (TypeIdx != 0)
5571 return UnableToLegalize;
5572
5573 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5574
5575 unsigned NewElemCount =
5576 NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits();
5577 LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType());
5578
5579 // Split the Src and Dst Reg into smaller registers
5580 SmallVector<Register> SrcVRegs, BitcastVRegs;
5581 if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy)
5582 return UnableToLegalize;
5583
5584 // Build new smaller bitcast instructions
5585 // Not supporting Leftover types for now but will have to
5586 for (Register Reg : SrcVRegs)
5587 BitcastVRegs.push_back(MIRBuilder.buildBitcast(NarrowTy, Reg).getReg(0));
5588
5589 MIRBuilder.buildMergeLikeInstr(DstReg, BitcastVRegs);
5590 MI.eraseFromParent();
5591 return Legalized;
5592 }
5593
/// Split a G_SHUFFLE_VECTOR with power-of-2 element count into two
/// half-width shuffles (or BUILD_VECTORs when a half draws from more than two
/// of the four split inputs), then re-concatenate the halves.
LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
      MI.getFirst3RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  // The shuffle should be canonicalized by now.
  if (DstTy != Src1Ty)
    return UnableToLegalize;
  if (DstTy != Src2Ty)
    return UnableToLegalize;

  if (!isPowerOf2_32(DstTy.getNumElements()))
    return UnableToLegalize;

  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
  // Further legalization attempts will be needed to do split further.
  NarrowTy =
      DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
  unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;

  SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
  extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs, MIRBuilder, MRI);
  extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs, MIRBuilder, MRI);
  Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
                        SplitSrc2Regs[1]};

  Register Hi, Lo;

  // If Lo or Hi uses elements from at most two of the four input vectors, then
  // express it as a vector shuffle of those two inputs. Otherwise extract the
  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
  SmallVector<int, 16> Ops;
  for (unsigned High = 0; High < 2; ++High) {
    Register &Output = High ? Hi : Lo;

    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with useBuildVector set.
    unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
    unsigned FirstMaskIdx = High * NewElts;
    bool UseBuildVector = false;
    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
      // The mask element. This indexes into the input.
      int Idx = Mask[FirstMaskIdx + MaskOffset];

      // The input vector this mask element indexes into.
      unsigned Input = (unsigned)Idx / NewElts;

      if (Input >= std::size(Inputs)) {
        // The mask element does not index into any input vector.
        Ops.push_back(-1);
        continue;
      }

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NewElts;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input) {
          // This input vector is already an operand.
          break;
        } else if (InputUsed[OpNo] == -1U) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= std::size(InputUsed)) {
        // More than two input vectors used! Give up on trying to create a
        // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Ops.push_back(Idx + OpNo * NewElts);
    }

    if (UseBuildVector) {
      LLT EltTy = NarrowTy.getElementType();
      SmallVector<Register, 16> SVOps;

      // Extract the input elements by hand.
      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
        // The mask element. This indexes into the input.
        int Idx = Mask[FirstMaskIdx + MaskOffset];

        // The input vector this mask element indexes into.
        unsigned Input = (unsigned)Idx / NewElts;

        if (Input >= std::size(Inputs)) {
          // The mask element is "undef" or indexes off the end of the input.
          SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
          continue;
        }

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NewElts;

        // Extract the vector element by hand.
        SVOps.push_back(MIRBuilder
                            .buildExtractVectorElement(
                                EltTy, Inputs[Input],
                                MIRBuilder.buildConstant(LLT::scalar(32), Idx))
                            .getReg(0));
      }

      // Construct the Lo/Hi output using a G_BUILD_VECTOR.
      Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
    } else if (InputUsed[0] == -1U) {
      // No input vectors were used! The result is undefined.
      Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
    } else {
      Register Op0 = Inputs[InputUsed[0]];
      // If only one input was used, use an undefined vector for the other.
      Register Op1 = InputUsed[1] == -1U
                         ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
                         : Inputs[InputUsed[1]];
      // At least one input vector was used. Create a new shuffle vector.
      Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
    }

    // Reset the per-half mask for the next iteration.
    Ops.clear();
  }

  MIRBuilder.buildMergeLikeInstr(DstReg, {Lo, Hi});
  MI.eraseFromParent();
  return Legalized;
}
5731
fewerElementsVectorReductions(MachineInstr & MI,unsigned int TypeIdx,LLT NarrowTy)5732 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
5733 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
5734 auto &RdxMI = cast<GVecReduce>(MI);
5735
5736 if (TypeIdx != 1)
5737 return UnableToLegalize;
5738
5739 // The semantics of the normal non-sequential reductions allow us to freely
5740 // re-associate the operation.
5741 auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
5742
5743 if (NarrowTy.isVector() &&
5744 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
5745 return UnableToLegalize;
5746
5747 unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
5748 SmallVector<Register> SplitSrcs;
5749 // If NarrowTy is a scalar then we're being asked to scalarize.
5750 const unsigned NumParts =
5751 NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
5752 : SrcTy.getNumElements();
5753
5754 extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
5755 if (NarrowTy.isScalar()) {
5756 if (DstTy != NarrowTy)
5757 return UnableToLegalize; // FIXME: handle implicit extensions.
5758
5759 if (isPowerOf2_32(NumParts)) {
5760 // Generate a tree of scalar operations to reduce the critical path.
5761 SmallVector<Register> PartialResults;
5762 unsigned NumPartsLeft = NumParts;
5763 while (NumPartsLeft > 1) {
5764 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
5765 PartialResults.emplace_back(
5766 MIRBuilder
5767 .buildInstr(ScalarOpc, {NarrowTy},
5768 {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
5769 .getReg(0));
5770 }
5771 SplitSrcs = PartialResults;
5772 PartialResults.clear();
5773 NumPartsLeft = SplitSrcs.size();
5774 }
5775 assert(SplitSrcs.size() == 1);
5776 MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
5777 MI.eraseFromParent();
5778 return Legalized;
5779 }
5780 // If we can't generate a tree, then just do sequential operations.
5781 Register Acc = SplitSrcs[0];
5782 for (unsigned Idx = 1; Idx < NumParts; ++Idx)
5783 Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
5784 .getReg(0);
5785 MIRBuilder.buildCopy(DstReg, Acc);
5786 MI.eraseFromParent();
5787 return Legalized;
5788 }
5789 SmallVector<Register> PartialReductions;
5790 for (unsigned Part = 0; Part < NumParts; ++Part) {
5791 PartialReductions.push_back(
5792 MIRBuilder.buildInstr(RdxMI.getOpcode(), {DstTy}, {SplitSrcs[Part]})
5793 .getReg(0));
5794 }
5795
5796 // If the types involved are powers of 2, we can generate intermediate vector
5797 // ops, before generating a final reduction operation.
5798 if (isPowerOf2_32(SrcTy.getNumElements()) &&
5799 isPowerOf2_32(NarrowTy.getNumElements())) {
5800 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
5801 }
5802
5803 Register Acc = PartialReductions[0];
5804 for (unsigned Part = 1; Part < NumParts; ++Part) {
5805 if (Part == NumParts - 1) {
5806 MIRBuilder.buildInstr(ScalarOpc, {DstReg},
5807 {Acc, PartialReductions[Part]});
5808 } else {
5809 Acc = MIRBuilder
5810 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
5811 .getReg(0);
5812 }
5813 }
5814 MI.eraseFromParent();
5815 return Legalized;
5816 }
5817
5818 LegalizerHelper::LegalizeResult
fewerElementsVectorSeqReductions(MachineInstr & MI,unsigned int TypeIdx,LLT NarrowTy)5819 LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
5820 unsigned int TypeIdx,
5821 LLT NarrowTy) {
5822 auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
5823 MI.getFirst3RegLLTs();
5824 if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
5825 DstTy != NarrowTy)
5826 return UnableToLegalize;
5827
5828 assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
5829 MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
5830 "Unexpected vecreduce opcode");
5831 unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
5832 ? TargetOpcode::G_FADD
5833 : TargetOpcode::G_FMUL;
5834
5835 SmallVector<Register> SplitSrcs;
5836 unsigned NumParts = SrcTy.getNumElements();
5837 extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
5838 Register Acc = ScalarReg;
5839 for (unsigned i = 0; i < NumParts; i++)
5840 Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[i]})
5841 .getReg(0);
5842
5843 MIRBuilder.buildCopy(DstReg, Acc);
5844 MI.eraseFromParent();
5845 return Legalized;
5846 }
5847
5848 LegalizerHelper::LegalizeResult
tryNarrowPow2Reduction(MachineInstr & MI,Register SrcReg,LLT SrcTy,LLT NarrowTy,unsigned ScalarOpc)5849 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
5850 LLT SrcTy, LLT NarrowTy,
5851 unsigned ScalarOpc) {
5852 SmallVector<Register> SplitSrcs;
5853 // Split the sources into NarrowTy size pieces.
5854 extractParts(SrcReg, NarrowTy,
5855 SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs,
5856 MIRBuilder, MRI);
5857 // We're going to do a tree reduction using vector operations until we have
5858 // one NarrowTy size value left.
5859 while (SplitSrcs.size() > 1) {
5860 SmallVector<Register> PartialRdxs;
5861 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
5862 Register LHS = SplitSrcs[Idx];
5863 Register RHS = SplitSrcs[Idx + 1];
5864 // Create the intermediate vector op.
5865 Register Res =
5866 MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
5867 PartialRdxs.push_back(Res);
5868 }
5869 SplitSrcs = std::move(PartialRdxs);
5870 }
5871 // Finally generate the requested NarrowTy based reduction.
5872 Observer.changingInstr(MI);
5873 MI.getOperand(1).setReg(SplitSrcs[0]);
5874 Observer.changedInstr(MI);
5875 return Legalized;
5876 }
5877
/// Expand a shift by a known-constant amount into operations on the two
/// HalfTy-sized pieces of the source value.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
                                             const LLT HalfTy, const LLT AmtTy) {

  // Split the input into its low and high halves.
  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  // Shift by zero: just re-merge the halves.
  if (Amt.isZero()) {
    MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {InL, InH});
    MI.eraseFromParent();
    return Legalized;
  }

  LLT NVT = HalfTy;
  unsigned NVTBits = HalfTy.getSizeInBits();
  unsigned VTBits = 2 * NVTBits;

  SrcOp Lo(Register(0)), Hi(Register(0));
  if (MI.getOpcode() == TargetOpcode::G_SHL) {
    if (Amt.ugt(VTBits)) {
      // Amount exceeds the full width: both halves are zero.
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      // Amount exceeds the half width: low is zero, high is the low input
      // shifted by the remainder.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = MIRBuilder.buildShl(NVT, InL,
                               MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
    } else if (Amt == NVTBits) {
      // Exactly a half-width shift: the low half moves up unchanged.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = InL;
    } else {
      // Amount is less than the half width: the high half combines bits
      // shifted out of the low half with its own shifted bits.
      Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrLHS =
          MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrRHS = MIRBuilder.buildLShr(
          NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
      Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
    }
  } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
    // Mirror image of the G_SHL cases, shifting towards the low half and
    // filling the high half with zeros.
    if (Amt.ugt(VTBits)) {
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      Lo = MIRBuilder.buildLShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
    }
  } else {
    // G_ASHR: like G_LSHR, but the high half is filled with copies of the
    // sign bit (an arithmetic shift of InH by NVTBits - 1).
    if (Amt.ugt(VTBits)) {
      Hi = Lo = MIRBuilder.buildAShr(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt.ugt(NVTBits)) {
      Lo = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
    }
  }

  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {Lo, Hi});
  MI.eraseFromParent();

  return Legalized;
}
5965
// TODO: Optimize if constant shift amount.
//
// Narrow a G_SHL/G_LSHR/G_ASHR to operate on two half-width registers.
// TypeIdx == 1 narrows only the shift-amount operand; TypeIdx == 0 splits the
// shifted value into a {Lo, Hi} pair, using either a cheap constant-amount
// expansion or a fully general select-based expansion.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                   LLT RequestedTy) {
  // Narrowing just the shift-amount operand (operand 2) is always fine on its
  // own; no splitting of the shifted value is needed.
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, RequestedTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Amt = MI.getOperand(2).getReg();
  LLT ShiftAmtTy = MRI.getType(Amt);
  const unsigned DstEltSize = DstTy.getScalarSizeInBits();
  // An odd bit width cannot be split into two equal halves.
  if (DstEltSize % 2 != 0)
    return UnableToLegalize;

  // Ignore the input type. We can only go to exactly half the size of the
  // input. If that isn't small enough, the resulting pieces will be further
  // legalized.
  const unsigned NewBitSize = DstEltSize / 2;
  const LLT HalfTy = LLT::scalar(NewBitSize);
  const LLT CondTy = LLT::scalar(1);

  // A known-constant amount allows a much cheaper expansion with no selects.
  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
    return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
                                       ShiftAmtTy);
  }

  // TODO: Expand with known bits.

  // Handle the fully general expansion by an unknown amount.
  auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);

  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  // How far the amount exceeds (or falls short of) the half width. Only one
  // of the two is meaningful for a given runtime amount; the selects below
  // pick the right expression.
  auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
  auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);

  auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
  auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
  auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);

  Register ResultRegs[2];
  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL: {
    // Short: ShAmt < NewBitSize
    auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
    auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
    auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.

    auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
    // Amt == 0 is selected separately: the "short" Hi expression would shift
    // by AmtLack == NewBitSize, a full-width (out-of-range) shift.
    auto Hi = MIRBuilder.buildSelect(
        HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR: {
    // Short: ShAmt < NewBitSize
    auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
    auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    MachineInstrBuilder HiL;
    if (MI.getOpcode() == TargetOpcode::G_LSHR) {
      HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero.
    } else {
      auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
      HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part.
    }
    auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
                                     {InH, AmtExcess}); // Lo from Hi part.

    // As in the G_SHL case, Amt == 0 must bypass the "short" Lo expression,
    // whose AmtLack shift would be out of range.
    auto Lo = MIRBuilder.buildSelect(
        HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));

    auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}
6075
6076 LegalizerHelper::LegalizeResult
moreElementsVectorPhi(MachineInstr & MI,unsigned TypeIdx,LLT MoreTy)6077 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
6078 LLT MoreTy) {
6079 assert(TypeIdx == 0 && "Expecting only Idx 0");
6080
6081 Observer.changingInstr(MI);
6082 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6083 MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
6084 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
6085 moreElementsVectorSrc(MI, MoreTy, I);
6086 }
6087
6088 MachineBasicBlock &MBB = *MI.getParent();
6089 MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
6090 moreElementsVectorDst(MI, MoreTy, 0);
6091 Observer.changedInstr(MI);
6092 return Legalized;
6093 }
6094
getNeutralElementForVecReduce(unsigned Opcode,MachineIRBuilder & MIRBuilder,LLT Ty)6095 MachineInstrBuilder LegalizerHelper::getNeutralElementForVecReduce(
6096 unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty) {
6097 assert(Ty.isScalar() && "Expected scalar type to make neutral element for");
6098
6099 switch (Opcode) {
6100 default:
6101 llvm_unreachable(
6102 "getNeutralElementForVecReduce called with invalid opcode!");
6103 case TargetOpcode::G_VECREDUCE_ADD:
6104 case TargetOpcode::G_VECREDUCE_OR:
6105 case TargetOpcode::G_VECREDUCE_XOR:
6106 case TargetOpcode::G_VECREDUCE_UMAX:
6107 return MIRBuilder.buildConstant(Ty, 0);
6108 case TargetOpcode::G_VECREDUCE_MUL:
6109 return MIRBuilder.buildConstant(Ty, 1);
6110 case TargetOpcode::G_VECREDUCE_AND:
6111 case TargetOpcode::G_VECREDUCE_UMIN:
6112 return MIRBuilder.buildConstant(
6113 Ty, APInt::getAllOnes(Ty.getScalarSizeInBits()));
6114 case TargetOpcode::G_VECREDUCE_SMAX:
6115 return MIRBuilder.buildConstant(
6116 Ty, APInt::getSignedMinValue(Ty.getSizeInBits()));
6117 case TargetOpcode::G_VECREDUCE_SMIN:
6118 return MIRBuilder.buildConstant(
6119 Ty, APInt::getSignedMaxValue(Ty.getSizeInBits()));
6120 case TargetOpcode::G_VECREDUCE_FADD:
6121 return MIRBuilder.buildFConstant(Ty, -0.0);
6122 case TargetOpcode::G_VECREDUCE_FMUL:
6123 return MIRBuilder.buildFConstant(Ty, 1.0);
6124 case TargetOpcode::G_VECREDUCE_FMINIMUM:
6125 case TargetOpcode::G_VECREDUCE_FMAXIMUM:
6126 assert(false && "getNeutralElementForVecReduce unimplemented for "
6127 "G_VECREDUCE_FMINIMUM and G_VECREDUCE_FMAXIMUM!");
6128 }
6129 llvm_unreachable("switch expected to return!");
6130 }
6131
// Legalize MI by widening the vector type identified by TypeIdx to MoreTy,
// padding sources with undef elements (or, for reductions, with a neutral
// element) and trimming the destination back down as needed.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  // Binary (and shift) operations: widen both sources and the destination.
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FCOPYSIGN:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_STRICT_FADD:
  case TargetOpcode::G_STRICT_FSUB:
  case TargetOpcode::G_STRICT_FMUL:
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  // Ternary operations: widen all three sources and the destination.
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_STRICT_FMA:
  case TargetOpcode::G_FSHR:
  case TargetOpcode::G_FSHL: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  // Only the (vector) source can be widened; the extracted result keeps its
  // type since the extracted position is unaffected by padding at the end.
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_EXTRACT:
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  // Unary (and insert-like) operations: widen the first source and the
  // destination; any remaining operands (index, imm) are left untouched.
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FREEZE:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_BITREVERSE:
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_SEXT_INREG:
  case TargetOpcode::G_ABS:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTPOP:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT: {
    auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
    if (TypeIdx == 1) {
      if (!CondTy.isScalar() ||
          DstTy.getElementCount() != MoreTy.getElementCount())
        return UnableToLegalize;

      // This is turning a scalar select of vectors into a vector
      // select. Broadcast the select condition.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
      Observer.changingInstr(MI);
      MI.getOperand(1).setReg(ShufSplat.getReg(0));
      Observer.changedInstr(MI);
      return Legalized;
    }

    // A vector condition would also need widening; not handled here.
    if (CondTy.isVector())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_UNMERGE_VALUES:
    return UnableToLegalize;
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_BUILD_VECTOR: {
    // Pad the element list with undefs up to MoreTy's length, build the
    // wider vector, then trim the result back to the original type.
    SmallVector<SrcOp, 8> Elts;
    for (auto Op : MI.uses()) {
      Elts.push_back(Op.getReg());
    }

    for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
      Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
    }

    MIRBuilder.buildDeleteTrailingVectorElements(
        MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
    MI.eraseFromParent();
    return Legalized;
  }
  // Conversions: widen the element count on both sides while preserving each
  // side's original element type.
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_FPTRUNC:
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI_SAT:
  case TargetOpcode::G_FPTOUI_SAT:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    Observer.changingInstr(MI);
    LLT SrcExtTy;
    LLT DstExtTy;
    if (TypeIdx == 0) {
      DstExtTy = MoreTy;
      SrcExtTy = LLT::fixed_vector(
          MoreTy.getNumElements(),
          MRI.getType(MI.getOperand(1).getReg()).getElementType());
    } else {
      DstExtTy = LLT::fixed_vector(
          MoreTy.getNumElements(),
          MRI.getType(MI.getOperand(0).getReg()).getElementType());
      SrcExtTy = MoreTy;
    }
    moreElementsVectorSrc(MI, SrcExtTy, 1);
    moreElementsVectorDst(MI, DstExtTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP:
  case TargetOpcode::G_FCMP: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    // Widen both compare sources; the destination keeps its element type
    // (usually s1) but takes the new element count.
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    LLT CondTy = LLT::fixed_vector(
        MoreTy.getNumElements(),
        MRI.getType(MI.getOperand(0).getReg()).getElementType());
    moreElementsVectorDst(MI, CondTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BITCAST: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

    // Scale the source element count so the widened source's total size
    // matches MoreTy's; bail out if the scaling is not exact.
    unsigned coefficient = SrcTy.getNumElements() * MoreTy.getNumElements();
    if (coefficient % DstTy.getNumElements() != 0)
      return UnableToLegalize;

    coefficient = coefficient / DstTy.getNumElements();

    LLT NewTy = SrcTy.changeElementCount(
        ElementCount::get(coefficient, MoreTy.isScalable()));
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, NewTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_VECREDUCE_FADD:
  case TargetOpcode::G_VECREDUCE_FMUL:
  case TargetOpcode::G_VECREDUCE_ADD:
  case TargetOpcode::G_VECREDUCE_MUL:
  case TargetOpcode::G_VECREDUCE_AND:
  case TargetOpcode::G_VECREDUCE_OR:
  case TargetOpcode::G_VECREDUCE_XOR:
  case TargetOpcode::G_VECREDUCE_SMAX:
  case TargetOpcode::G_VECREDUCE_SMIN:
  case TargetOpcode::G_VECREDUCE_UMAX:
  case TargetOpcode::G_VECREDUCE_UMIN: {
    // Pad the input vector, then overwrite the padding lanes with the
    // operation's neutral element so the reduction result is unchanged.
    LLT OrigTy = MRI.getType(MI.getOperand(1).getReg());
    MachineOperand &MO = MI.getOperand(1);
    auto NewVec = MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO);
    auto NeutralElement = getNeutralElementForVecReduce(
        MI.getOpcode(), MIRBuilder, MoreTy.getElementType());

    LLT IdxTy(TLI.getVectorIdxLLT(MIRBuilder.getDataLayout()));
    for (size_t i = OrigTy.getNumElements(), e = MoreTy.getNumElements();
         i != e; i++) {
      auto Idx = MIRBuilder.buildConstant(IdxTy, i);
      NewVec = MIRBuilder.buildInsertVectorElement(MoreTy, NewVec,
                                                   NeutralElement, Idx);
    }

    Observer.changingInstr(MI);
    MO.setReg(NewVec.getReg(0));
    Observer.changedInstr(MI);
    return Legalized;
  }

  default:
    return UnableToLegalize;
  }
}
6389
// Rewrite a G_SHUFFLE_VECTOR whose destination length differs from its source
// length into shuffles whose operands all have matching lengths: either pad
// the mask with undef entries (destination shorter than sources), or pad the
// sources by concatenating undef vectors and then extract the elements the
// original destination needs (destination longer than sources).
LegalizerHelper::LegalizeResult
LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  unsigned MaskNumElts = Mask.size();
  unsigned SrcNumElts = SrcTy.getNumElements();
  LLT DestEltTy = DstTy.getElementType();

  // Lengths already match; nothing to do.
  if (MaskNumElts == SrcNumElts)
    return Legalized;

  if (MaskNumElts < SrcNumElts) {
    // Extend mask to match new destination vector size with
    // undef values.
    SmallVector<int, 16> NewMask(SrcNumElts, -1);
    llvm::copy(Mask, NewMask.begin());

    moreElementsVectorDst(MI, SrcTy, 0);
    MIRBuilder.setInstrAndDebugLoc(MI);
    MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                  MI.getOperand(1).getReg(),
                                  MI.getOperand(2).getReg(), NewMask);
    MI.eraseFromParent();

    return Legalized;
  }

  // Destination is longer than the sources: round the mask length up to a
  // multiple of the source length so the sources can be concatenated.
  unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
  unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
  LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy);

  // Create new source vectors by concatenating the initial
  // source vectors with undefined vectors of the same size.
  auto Undef = MIRBuilder.buildUndef(SrcTy);
  SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0));
  SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0));
  MOps1[0] = MI.getOperand(1).getReg();
  MOps2[0] = MI.getOperand(2).getReg();

  auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1);
  auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2);

  // Readjust mask for new input vector length.
  SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
  for (unsigned I = 0; I != MaskNumElts; ++I) {
    int Idx = Mask[I];
    // Indices into the second source shift because that source now begins at
    // PaddedMaskNumElts rather than SrcNumElts.
    if (Idx >= static_cast<int>(SrcNumElts))
      Idx += PaddedMaskNumElts - SrcNumElts;
    MappedOps[I] = Idx;
  }

  // If we got more elements than required, extract subvector.
  if (MaskNumElts != PaddedMaskNumElts) {
    auto Shuffle =
        MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps);

    SmallVector<Register, 16> Elts(MaskNumElts);
    for (unsigned I = 0; I < MaskNumElts; ++I) {
      Elts[I] =
          MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I)
              .getReg(0);
    }
    MIRBuilder.buildBuildVector(DstReg, Elts);
  } else {
    MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps);
  }

  MI.eraseFromParent();
  return LegalizerHelper::LegalizeResult::Legalized;
}
6460
6461 LegalizerHelper::LegalizeResult
moreElementsVectorShuffle(MachineInstr & MI,unsigned int TypeIdx,LLT MoreTy)6462 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
6463 unsigned int TypeIdx, LLT MoreTy) {
6464 auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
6465 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6466 unsigned NumElts = DstTy.getNumElements();
6467 unsigned WidenNumElts = MoreTy.getNumElements();
6468
6469 if (DstTy.isVector() && Src1Ty.isVector() &&
6470 DstTy.getNumElements() != Src1Ty.getNumElements()) {
6471 return equalizeVectorShuffleLengths(MI);
6472 }
6473
6474 if (TypeIdx != 0)
6475 return UnableToLegalize;
6476
6477 // Expect a canonicalized shuffle.
6478 if (DstTy != Src1Ty || DstTy != Src2Ty)
6479 return UnableToLegalize;
6480
6481 moreElementsVectorSrc(MI, MoreTy, 1);
6482 moreElementsVectorSrc(MI, MoreTy, 2);
6483
6484 // Adjust mask based on new input vector length.
6485 SmallVector<int, 16> NewMask(WidenNumElts, -1);
6486 for (unsigned I = 0; I != NumElts; ++I) {
6487 int Idx = Mask[I];
6488 if (Idx < static_cast<int>(NumElts))
6489 NewMask[I] = Idx;
6490 else
6491 NewMask[I] = Idx - NumElts + WidenNumElts;
6492 }
6493 moreElementsVectorDst(MI, MoreTy, 0);
6494 MIRBuilder.setInstrAndDebugLoc(MI);
6495 MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
6496 MI.getOperand(1).getReg(),
6497 MI.getOperand(2).getReg(), NewMask);
6498 MI.eraseFromParent();
6499 return Legalized;
6500 }
6501
// Emit the NarrowTy-sized pieces of Src1 * Src2 into DstRegs using the
// schoolbook algorithm: each destination part sums (a) the low halves
// (G_MUL) of the cross-products that land in that column, (b) the high
// halves (G_UMULH) of the previous column's products, and (c) the carries
// accumulated while summing the previous column. DstRegs may be longer than
// the sources (e.g. for a full double-width product).
void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
  DstRegs[DstIdx] = FactorSum;

  Register CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    // The bounds clamp i so that both DstIdx - i and i stay within SrcParts.
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(DstIdx, SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
      Factors.push_back(Mul.getReg(0));
    }
    // Collect high parts of muls from previous DstIdx.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
      Factors.push_back(Umulh.getReg(0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      // Use overflowing adds so each carry can be zero-extended and summed
      // for the next column.
      MachineInstrBuilder Uaddo =
          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
      FactorSum = Uaddo.getReg(0);
      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
        FactorSum = Uaddo.getReg(0);
        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
      FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}
6564
// Narrow an add/sub-family instruction (with optional carry-in and/or
// carry-out) into a chain of NarrowTy-sized operations linked through carry
// flags, including a possible leftover piece narrower than NarrowTy.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstType = MRI.getType(DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  // Pick the opcodes for the narrowed chain: OpO for the lowest part (no
  // carry-in), OpE for the middle parts (carry-in and carry-out), and OpF for
  // the final part, which must use the signed variant when the original
  // instruction reported signed overflow.
  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(NumDefs).getReg();
  Register Src2 = MI.getOperand(NumDefs + 1).getReg();
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(1).getReg();
  // An extra trailing operand means the instruction consumes a carry-in.
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(NumDefs + 2).getReg();

  // Split both sources into NarrowTy parts plus (possibly) one leftover part
  // of LeftoverTy when the width is not a multiple of NarrowTy.
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left,
               MIRBuilder, MRI);
  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left, MIRBuilder,
               MRI);

  int NarrowParts = Src1Regs.size();
  Src1Regs.append(Src1Left);
  Src2Regs.append(Src2Left);
  DstRegs.reserve(Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
    Register CarryOut;
    // Forward the final carry-out to the destination register
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;
    else
      CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));

    // First part has no carry-in; the last part may need the signed variant.
    if (!CarryIn) {
      MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(DstReg);
    // Each part's carry-out feeds the next part's carry-in.
    CarryIn = CarryOut;
  }
  insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
              ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
              ArrayRef(DstRegs).drop_front(NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}
6660
6661 LegalizerHelper::LegalizeResult
narrowScalarMul(MachineInstr & MI,LLT NarrowTy)6662 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
6663 auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
6664
6665 LLT Ty = MRI.getType(DstReg);
6666 if (Ty.isVector())
6667 return UnableToLegalize;
6668
6669 unsigned Size = Ty.getSizeInBits();
6670 unsigned NarrowSize = NarrowTy.getSizeInBits();
6671 if (Size % NarrowSize != 0)
6672 return UnableToLegalize;
6673
6674 unsigned NumParts = Size / NarrowSize;
6675 bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
6676 unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
6677
6678 SmallVector<Register, 2> Src1Parts, Src2Parts;
6679 SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
6680 extractParts(Src1, NarrowTy, NumParts, Src1Parts, MIRBuilder, MRI);
6681 extractParts(Src2, NarrowTy, NumParts, Src2Parts, MIRBuilder, MRI);
6682 multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
6683
6684 // Take only high half of registers if this is high mul.
6685 ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
6686 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
6687 MI.eraseFromParent();
6688 return Legalized;
6689 }
6690
6691 LegalizerHelper::LegalizeResult
narrowScalarFPTOI(MachineInstr & MI,unsigned TypeIdx,LLT NarrowTy)6692 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
6693 LLT NarrowTy) {
6694 if (TypeIdx != 0)
6695 return UnableToLegalize;
6696
6697 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
6698
6699 Register Src = MI.getOperand(1).getReg();
6700 LLT SrcTy = MRI.getType(Src);
6701
6702 // If all finite floats fit into the narrowed integer type, we can just swap
6703 // out the result type. This is practically only useful for conversions from
6704 // half to at least 16-bits, so just handle the one case.
6705 if (SrcTy.getScalarType() != LLT::scalar(16) ||
6706 NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
6707 return UnableToLegalize;
6708
6709 Observer.changingInstr(MI);
6710 narrowScalarDst(MI, NarrowTy, 0,
6711 IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
6712 Observer.changedInstr(MI);
6713 return Legalized;
6714 }
6715
// Narrow the wide source of a G_EXTRACT: split the source into NarrowTy
// pieces, extract from each piece the bits that overlap the requested range,
// and reassemble the destination from the collected segments.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  // Only the source type (index 1) can be narrowed here.
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
               MIRBuilder, MRI);

  // OpStart/OpSize describe the bit range being extracted from the source.
  Register OpReg = MI.getOperand(0).getReg();
  uint64_t OpStart = MI.getOperand(2).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(SrcRegs[i]);
      continue;
    }

    // The range partially overlaps this piece. Compute the offset within the
    // piece where the wanted bits begin, and how many of them live here.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      // The wanted range started in an earlier piece; take from bit 0 up to
      // whichever ends first, this piece or the wanted range.
      ExtractOffset = 0;
      SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
    } else {
      // The wanted range begins inside this piece.
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
    }

    DstRegs.push_back(SegReg);
  }

  // Reassemble the destination from the collected segments.
  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    MIRBuilder.buildBuildVector(DstReg, DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
  else
    MIRBuilder.buildCopy(DstReg, DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}
6782
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // Narrow G_INSERT by splitting the container value (operand 1) into
  // NarrowTy-sized pieces, rewriting each piece the inserted value
  // (operand 2) overlaps, and re-merging the pieces into the destination.
  //
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy;
  // Split the container into NarrowTy pieces plus (possibly) one smaller
  // leftover piece.
  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
               LeftoverRegs, MIRBuilder, MRI);

  // Handle the leftover piece in the same loop as the full-width pieces; it
  // is any-extended to NarrowTy below when needed.
  SrcRegs.append(LeftoverRegs);

  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(2).getReg();
  uint64_t OpStart = MI.getOperand(3).getImm(); // Insert offset, in bits.
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    unsigned DstStart = I * NarrowSize; // Bit offset of this piece.

    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(SrcReg);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      // The inserted value starts before this piece: take its tail.
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
    } else {
      // The inserted value starts within this piece.
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
          std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
    }

    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
    DstRegs.push_back(DstReg);
  }

  // Because a leftover piece was any-extended to NarrowTy, the merged value
  // can be wider than the original type; truncate back down in that case.
  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
    MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
    MIRBuilder.buildTrunc(DstReg, MergeReg);
  } else
    MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
6864
6865 LegalizerHelper::LegalizeResult
narrowScalarBasic(MachineInstr & MI,unsigned TypeIdx,LLT NarrowTy)6866 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
6867 LLT NarrowTy) {
6868 Register DstReg = MI.getOperand(0).getReg();
6869 LLT DstTy = MRI.getType(DstReg);
6870
6871 assert(MI.getNumOperands() == 3 && TypeIdx == 0);
6872
6873 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
6874 SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
6875 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
6876 LLT LeftoverTy;
6877 if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
6878 Src0Regs, Src0LeftoverRegs, MIRBuilder, MRI))
6879 return UnableToLegalize;
6880
6881 LLT Unused;
6882 if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
6883 Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
6884 llvm_unreachable("inconsistent extractParts result");
6885
6886 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
6887 auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
6888 {Src0Regs[I], Src1Regs[I]});
6889 DstRegs.push_back(Inst.getReg(0));
6890 }
6891
6892 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
6893 auto Inst = MIRBuilder.buildInstr(
6894 MI.getOpcode(),
6895 {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
6896 DstLeftoverRegs.push_back(Inst.getReg(0));
6897 }
6898
6899 insertParts(DstReg, DstTy, NarrowTy, DstRegs,
6900 LeftoverTy, DstLeftoverRegs);
6901
6902 MI.eraseFromParent();
6903 return Legalized;
6904 }
6905
6906 LegalizerHelper::LegalizeResult
narrowScalarExt(MachineInstr & MI,unsigned TypeIdx,LLT NarrowTy)6907 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
6908 LLT NarrowTy) {
6909 if (TypeIdx != 0)
6910 return UnableToLegalize;
6911
6912 auto [DstReg, SrcReg] = MI.getFirst2Regs();
6913
6914 LLT DstTy = MRI.getType(DstReg);
6915 if (DstTy.isVector())
6916 return UnableToLegalize;
6917
6918 SmallVector<Register, 8> Parts;
6919 LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
6920 LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
6921 buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
6922
6923 MI.eraseFromParent();
6924 return Legalized;
6925 }
6926
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // Narrow G_SELECT (type index 0 only) by splitting both value operands
  // into NarrowTy-sized pieces plus an optional leftover piece and emitting
  // one select per piece, all driven by the same scalar condition.
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register CondReg = MI.getOperand(1).getReg();
  LLT CondTy = MRI.getType(CondReg);
  if (CondTy.isVector()) // TODO: Handle vselect
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
  LLT LeftoverTy;
  // Split the true value; this also determines LeftoverTy.
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
    return UnableToLegalize;

  // The false value has the same type, so splitting it must succeed the same
  // way.
  LLT Unused;
  if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
                    Src2Regs, Src2LeftoverRegs, MIRBuilder, MRI))
    llvm_unreachable("inconsistent extractParts result");

  // Select each NarrowTy-sized piece.
  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(NarrowTy,
                                         CondReg, Src1Regs[I], Src2Regs[I]);
    DstRegs.push_back(Select.getReg(0));
  }

  // And each leftover-sized piece, if any.
  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(
        LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
    DstLeftoverRegs.push_back(Select.getReg(0));
  }

  // Recombine the piecewise results into the destination.
  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}
6972
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  // Narrow G_CTLZ/G_CTLZ_ZERO_UNDEF whose source (type index 1) is a scalar
  // exactly twice as wide as NarrowTy, by combining counts of the two halves.
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(1), C_0);
    // For the zero-undef opcode the whole input is assumed nonzero, so when
    // Hi == 0, Lo must be nonzero and the zero-undef variant is safe for it.
    auto LoCTLZ = IsUndef ?
      B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
      B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
    // Hi is only consumed on the select arm where it is nonzero, so the
    // zero-undef variant is always safe here.
    auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
    B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
7005
7006 LegalizerHelper::LegalizeResult
narrowScalarCTTZ(MachineInstr & MI,unsigned TypeIdx,LLT NarrowTy)7007 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
7008 LLT NarrowTy) {
7009 if (TypeIdx != 1)
7010 return UnableToLegalize;
7011
7012 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7013 unsigned NarrowSize = NarrowTy.getSizeInBits();
7014
7015 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7016 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
7017
7018 MachineIRBuilder &B = MIRBuilder;
7019 auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
7020 // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
7021 auto C_0 = B.buildConstant(NarrowTy, 0);
7022 auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
7023 UnmergeSrc.getReg(0), C_0);
7024 auto HiCTTZ = IsUndef ?
7025 B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
7026 B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
7027 auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
7028 auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
7029 auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
7030 B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
7031
7032 MI.eraseFromParent();
7033 return Legalized;
7034 }
7035
7036 return UnableToLegalize;
7037 }
7038
7039 LegalizerHelper::LegalizeResult
narrowScalarCTPOP(MachineInstr & MI,unsigned TypeIdx,LLT NarrowTy)7040 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
7041 LLT NarrowTy) {
7042 if (TypeIdx != 1)
7043 return UnableToLegalize;
7044
7045 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7046 unsigned NarrowSize = NarrowTy.getSizeInBits();
7047
7048 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7049 auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
7050
7051 auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
7052 auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
7053 MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
7054
7055 MI.eraseFromParent();
7056 return Legalized;
7057 }
7058
7059 return UnableToLegalize;
7060 }
7061
7062 LegalizerHelper::LegalizeResult
narrowScalarFLDEXP(MachineInstr & MI,unsigned TypeIdx,LLT NarrowTy)7063 LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
7064 LLT NarrowTy) {
7065 if (TypeIdx != 1)
7066 return UnableToLegalize;
7067
7068 MachineIRBuilder &B = MIRBuilder;
7069 Register ExpReg = MI.getOperand(2).getReg();
7070 LLT ExpTy = MRI.getType(ExpReg);
7071
7072 unsigned ClampSize = NarrowTy.getScalarSizeInBits();
7073
7074 // Clamp the exponent to the range of the target type.
7075 auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
7076 auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
7077 auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
7078 auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);
7079
7080 auto Trunc = B.buildTrunc(NarrowTy, Clamp);
7081 Observer.changingInstr(MI);
7082 MI.getOperand(2).setReg(Trunc.getReg(0));
7083 Observer.changedInstr(MI);
7084 return Legalized;
7085 }
7086
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  // Expand bit-counting operations (G_CTLZ/G_CTTZ/G_CTPOP and the zero-undef
  // variants) into forms the target can handle, preferring a related
  // supported count opcode and falling back to bit tricks otherwise.
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  // An opcode is a usable lowering target if it is legal, or will be handled
  // by a libcall or custom code.
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
    unsigned Len = SrcTy.getSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >>16);
    // x = x | (x >>32); // for 64-bit input
    // Upto NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    //
    // The shift cascade smears the highest set bit into every lower
    // position, so popcount then yields (Len - number of leading zeros).
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

    unsigned Len = SrcTy.getSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x-1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    //
    // ~x & (x - 1) sets exactly the bits below the lowest set bit of x, so
    // counting them (directly or via Len - ctlz) gives cttz(x).
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    // Otherwise rewrite this instruction into a popcount of the mask.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Count set bits in blocks of 2 bits. Default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives same result in blocks of 2 with one instruction less.
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // In order to get count in blocks of 4 add values from adjacent block of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For count in blocks of 8 bits we don't have to mask high 4 bits before
    // addition since count value sits in range {0,...,8} and 4 bits are enough
    // to hold such binary values. After addition high 4 bits still hold count
    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));

    // Shift count result from 8 high bits to low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);

    // Summing the byte counts needs a multiply; if G_MUL would itself be
    // lowered or expanded to a libcall, use a shift-add cascade instead.
    auto IsMulSupported = [this](const LLT Ty) {
      auto Action = LI.getAction({TargetOpcode::G_MUL, {Ty}}).Action;
      return Action == Legal || Action == WidenScalar || Action == Custom;
    };
    if (IsMulSupported(Ty)) {
      auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
      B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
    } else {
      // Equivalent shift-add cascade: each step accumulates byte sums from
      // twice as far away, leaving the total in the top byte.
      auto ResTmp = B8Count;
      for (unsigned Shift = 8; Shift < Size; Shift *= 2) {
        auto ShiftC = B.buildConstant(Ty, Shift);
        auto Shl = B.buildShl(Ty, ResTmp, ShiftC);
        ResTmp = B.buildAdd(Ty, ResTmp, Shl);
      }
      B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  }
}
7259
7260 // Check that (every element of) Reg is undef or not an exact multiple of BW.
isNonZeroModBitWidthOrUndef(const MachineRegisterInfo & MRI,Register Reg,unsigned BW)7261 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
7262 Register Reg, unsigned BW) {
7263 return matchUnaryPredicate(
7264 MRI, Reg,
7265 [=](const Constant *C) {
7266 // Null constant here means an undef.
7267 const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
7268 return !CI || CI->getValue().urem(BW) != 0;
7269 },
7270 /*AllowUndefs*/ true);
7271 }
7272
7273 LegalizerHelper::LegalizeResult
lowerFunnelShiftWithInverse(MachineInstr & MI)7274 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
7275 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7276 LLT Ty = MRI.getType(Dst);
7277 LLT ShTy = MRI.getType(Z);
7278
7279 unsigned BW = Ty.getScalarSizeInBits();
7280
7281 if (!isPowerOf2_32(BW))
7282 return UnableToLegalize;
7283
7284 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7285 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7286
7287 if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
7288 // fshl X, Y, Z -> fshr X, Y, -Z
7289 // fshr X, Y, Z -> fshl X, Y, -Z
7290 auto Zero = MIRBuilder.buildConstant(ShTy, 0);
7291 Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
7292 } else {
7293 // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
7294 // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
7295 auto One = MIRBuilder.buildConstant(ShTy, 1);
7296 if (IsFSHL) {
7297 Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
7298 X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
7299 } else {
7300 X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
7301 Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
7302 }
7303
7304 Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
7305 }
7306
7307 MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
7308 MI.eraseFromParent();
7309 return Legalized;
7310 }
7311
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
  // Expand G_FSHL/G_FSHR into two plain shifts plus an OR:
  //   fshl X, Y, Z: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
  //   fshr X, Y, Z: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
  // taking care never to emit a shift by the full bitwidth BW.
  auto [Dst, X, Y, Z] = MI.getFirst4Regs();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  const unsigned BW = Ty.getScalarSizeInBits();
  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;

  Register ShX, ShY;
  Register ShAmt, InvShAmt;

  // FIXME: Emit optimized urem by constant instead of letting it expand later.
  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl: X << C | Y >> (BW - C)
    // fshr: X << (BW - C) | Y >> C
    // where C = Z % BW is not zero
    //
    // C != 0 guarantees BW - C < BW, so both shifts stay in range.
    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
  } else {
    // C might be zero, so split the inverse shift into a constant shift by 1
    // and a shift by (BW - 1 - C), both of which are always in range:
    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
    if (isPowerOf2_32(BW)) {
      // Z % BW -> Z & (BW - 1)
      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
    } else {
      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
    }

    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
    } else {
      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
    }
  }

  // The two halves cover disjoint bit ranges, so the OR can be marked
  // disjoint.
  MIRBuilder.buildOr(Dst, ShX, ShY, MachineInstr::Disjoint);
  MI.eraseFromParent();
  return Legalized;
}
7366
7367 LegalizerHelper::LegalizeResult
lowerFunnelShift(MachineInstr & MI)7368 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
7369 // These operations approximately do the following (while avoiding undefined
7370 // shifts by BW):
7371 // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
7372 // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
7373 Register Dst = MI.getOperand(0).getReg();
7374 LLT Ty = MRI.getType(Dst);
7375 LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
7376
7377 bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7378 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7379
7380 // TODO: Use smarter heuristic that accounts for vector legalization.
7381 if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
7382 return lowerFunnelShiftAsShifts(MI);
7383
7384 // This only works for powers of 2, fallback to shifts if it fails.
7385 LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
7386 if (Result == UnableToLegalize)
7387 return lowerFunnelShiftAsShifts(MI);
7388 return Result;
7389 }
7390
LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
  // Lower an extend whose per-element width more than doubles by first
  // extending to an intermediate type of twice the source element size, then
  // splitting and extending the halves separately.
  // NOTE(review): the changeElementCount calls below only make sense for
  // vector types — presumably scalar sources never take this path; confirm
  // with the callers that route into lowerEXT.
  auto [Dst, Src] = MI.getFirst2Regs();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  uint32_t DstTySize = DstTy.getSizeInBits();
  uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
  uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();

  // The halving/doubling scheme below requires power-of-2 sizes.
  if (!isPowerOf2_32(DstTySize) || !isPowerOf2_32(DstTyScalarSize) ||
      !isPowerOf2_32(SrcTyScalarSize))
    return UnableToLegalize;

  // The step between extend is too large, split it by creating an intermediate
  // extend instruction
  if (SrcTyScalarSize * 2 < DstTyScalarSize) {
    LLT MidTy = SrcTy.changeElementSize(SrcTyScalarSize * 2);
    // If the destination type is illegal, split it into multiple statements
    // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
    auto NewExt = MIRBuilder.buildInstr(MI.getOpcode(), {MidTy}, {Src});
    // Unmerge the vector
    LLT EltTy = MidTy.changeElementCount(
        MidTy.getElementCount().divideCoefficientBy(2));
    auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, NewExt);

    // ZExt the vectors
    LLT ZExtResTy = DstTy.changeElementCount(
        DstTy.getElementCount().divideCoefficientBy(2));
    auto ZExtRes1 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(0)});
    auto ZExtRes2 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(1)});

    // Merge the ending vectors
    MIRBuilder.buildMergeLikeInstr(Dst, {ZExtRes1, ZExtRes2});

    MI.eraseFromParent();
    return Legalized;
  }
  return UnableToLegalize;
}
7432
LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
  // MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  // NOTE: this local intentionally shadows the member MRI with the builder's
  // MachineRegisterInfo.
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectiondDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  assert(MI.getOpcode() == TargetOpcode::G_TRUNC);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);

  // Only power-of-2 vector shapes are handled, so the halving below is exact.
  if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
      isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
      isPowerOf2_32(SrcTy.getNumElements()) &&
      isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
    // Split input type.
    LLT SplitSrcTy = SrcTy.changeElementCount(
        SrcTy.getElementCount().divideCoefficientBy(2));

    // First, split the source into two smaller vectors.
    SmallVector<Register, 2> SplitSrcs;
    extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs, MIRBuilder, MRI);

    // Truncate the splits into intermediate narrower elements.
    // When more than one halving step is needed, only go down to twice the
    // final element size here; the last step happens after re-merging.
    LLT InterTy;
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
    else
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits());
    for (Register &Src : SplitSrcs)
      Src = MIRBuilder.buildTrunc(InterTy, Src).getReg(0);

    // Combine the new truncates into one vector
    auto Merge = MIRBuilder.buildMergeLikeInstr(
        DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs);

    // Truncate the new vector to the final result type
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0));
    else
      MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0));

    MI.eraseFromParent();

    return Legalized;
  }
  return UnableToLegalize;
}
7488
7489 LegalizerHelper::LegalizeResult
lowerRotateWithReverseRotate(MachineInstr & MI)7490 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
7491 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
7492 auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
7493 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
7494 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
7495 auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
7496 MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
7497 MI.eraseFromParent();
7498 return Legalized;
7499 }
7500
LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
  // Lower G_ROTL/G_ROTR, preferring (in order) the reversed rotate, a funnel
  // shift, and finally a two-shift OR expansion.
  auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();

  unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;

  MIRBuilder.setInstrAndDebugLoc(MI);

  // If a rotate in the other direction is supported, use it.
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
      isPowerOf2_32(EltSizeInBits))
    return lowerRotateWithReverseRotate(MI);

  // If a funnel shift is supported, use it: a rotate is a funnel shift with
  // both value inputs equal to Src.
  unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  bool IsFShLegal = false;
  if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
      LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
    auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
                                Register R3) {
      MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
      MI.eraseFromParent();
      return Legalized;
    };
    // If a funnel shift in the other direction is supported, use it.
    if (IsFShLegal) {
      return buildFunnelShift(FShOpc, Dst, Src, Amt);
    } else if (isPowerOf2_32(EltSizeInBits)) {
      // Negating the amount reverses direction (valid for power-of-2 widths).
      Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
      return buildFunnelShift(RevFsh, Dst, Src, Amt);
    }
  }

  // Fall back to shifts and an OR, avoiding any shift by the full bitwidth.
  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
  unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
  auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
  Register ShVal;
  Register RevShiftVal;
  if (isPowerOf2_32(EltSizeInBits)) {
    // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
    // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
    auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
    auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
  } else {
    // For non-power-of-2 widths, split the reverse shift into a constant
    // shift by 1 plus a shift by (w - 1 - (c % w)), both always in range:
    // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
    // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
    auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
    auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
    auto One = MIRBuilder.buildConstant(AmtTy, 1);
    auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
  }
  MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
  MI.eraseFromParent();
  return Legalized;
}
7567
// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // The code below mirrors this reference implementation, building the
  // biased exponent and 23-bit mantissa directly and applying
  // round-to-nearest-even via the `r` correction term:
  //
  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
  auto Zero64 = MIRBuilder.buildConstant(S64, 0);

  // lz = clz(u); the zero-input case is handled by the select on E below.
  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);

  // e = 127 + 63 - lz (the IEEE-754 single-precision bias plus the position
  // of the leading bit), or 0 when the input is zero.
  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
  auto Sub = MIRBuilder.buildSub(S32, K, LZ);

  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);

  // Normalize: shift the leading bit to the top and drop it (it is implicit
  // in the IEEE encoding).
  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);

  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);

  // t = the 40 bits that will be shifted out of the mantissa.
  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(S64, U, Mask1);

  // v = (e << 23) | top 23 mantissa bits.
  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));

  // Round to nearest, ties to even: r = 1 if the discarded bits exceed half
  // an ulp, v & 1 if exactly half (round to even), else 0.
  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
  auto One = MIRBuilder.buildConstant(S32, 1);

  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
  MIRBuilder.buildAdd(Dst, V, R);

  MI.eraseFromParent();
  return Legalized;
}
7625
7626 // Expand s32 = G_UITOFP s64 to an IEEE float representation using bit
7627 // operations and G_SITOFP
7628 LegalizerHelper::LegalizeResult
lowerU64ToF32WithSITOFP(MachineInstr & MI)7629 LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
7630 auto [Dst, Src] = MI.getFirst2Regs();
7631 const LLT S64 = LLT::scalar(64);
7632 const LLT S32 = LLT::scalar(32);
7633 const LLT S1 = LLT::scalar(1);
7634
7635 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
7636
7637 // For i64 < INT_MAX we simply reuse SITOFP.
7638 // Otherwise, divide i64 by 2, round result by ORing with the lowest bit
7639 // saved before division, convert to float by SITOFP, multiply the result
7640 // by 2.
7641 auto One = MIRBuilder.buildConstant(S64, 1);
7642 auto Zero = MIRBuilder.buildConstant(S64, 0);
7643 // Result if Src < INT_MAX
7644 auto SmallResult = MIRBuilder.buildSITOFP(S32, Src);
7645 // Result if Src >= INT_MAX
7646 auto Halved = MIRBuilder.buildLShr(S64, Src, One);
7647 auto LowerBit = MIRBuilder.buildAnd(S64, Src, One);
7648 auto RoundedHalved = MIRBuilder.buildOr(S64, Halved, LowerBit);
7649 auto HalvedFP = MIRBuilder.buildSITOFP(S32, RoundedHalved);
7650 auto LargeResult = MIRBuilder.buildFAdd(S32, HalvedFP, HalvedFP);
7651 // Check if the original value is larger than INT_MAX by comparing with
7652 // zero to pick one of the two conversions.
7653 auto IsLarge =
7654 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, S1, Src, Zero);
7655 MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult);
7656
7657 MI.eraseFromParent();
7658 return Legalized;
7659 }
7660
// Expand s64 = G_UITOFP s64 using bit and float arithmetic operations to an
// IEEE double representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  // We create double value from 32 bit parts with 32 exponent difference.
  // Note that + and - are float operations that adjust the implicit leading
  // one, the bases 2^52 and 2^84 are for illustrative purposes.
  //
  // X = 2^52 * 1.0...LowBits
  // Y = 2^84 * 1.0...HighBits
  // Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
  //         = - 2^52 * 1.0...HighBits
  // Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
  //
  // 0x433... is the IEEE-754 bit pattern of the double 2^52 and
  // 0x453... is 2^84; 0x4530...0010...0 is 2^84 + 2^52.
  auto TwoP52 = MIRBuilder.buildConstant(S64, UINT64_C(0x4330000000000000));
  auto TwoP84 = MIRBuilder.buildConstant(S64, UINT64_C(0x4530000000000000));
  auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
  auto TwoP52P84FP = MIRBuilder.buildFConstant(S64, TwoP52P84);
  auto HalfWidth = MIRBuilder.buildConstant(S64, 32);

  // Integer-OR the low 32 bits of Src into the mantissa of 2^52: exponent
  // bits come from TwoP52, payload from Src.
  auto LowBits = MIRBuilder.buildTrunc(S32, Src);
  LowBits = MIRBuilder.buildZExt(S64, LowBits);
  auto LowBitsFP = MIRBuilder.buildOr(S64, TwoP52, LowBits);
  // Likewise OR the high 32 bits into the mantissa of 2^84.
  auto HighBits = MIRBuilder.buildLShr(S64, Src, HalfWidth);
  auto HighBitsFP = MIRBuilder.buildOr(S64, TwoP84, HighBits);
  // Floating-point arithmetic strips the bias constants and combines the
  // two halves with a single correctly-rounded add.
  auto Scratch = MIRBuilder.buildFSub(S64, HighBitsFP, TwoP52P84FP);
  MIRBuilder.buildFAdd(Dst, Scratch, LowBitsFP);

  MI.eraseFromParent();
  return Legalized;
}
7697
7698 /// i64->fp16 itofp can be lowered to i64->f64,f64->f32,f32->f16. We cannot
7699 /// convert fpround f64->f16 without double-rounding, so we manually perform the
7700 /// lowering here where we know it is valid.
7701 static LegalizerHelper::LegalizeResult
loweri64tof16ITOFP(MachineInstr & MI,Register Dst,LLT DstTy,Register Src,LLT SrcTy,MachineIRBuilder & MIRBuilder)7702 loweri64tof16ITOFP(MachineInstr &MI, Register Dst, LLT DstTy, Register Src,
7703 LLT SrcTy, MachineIRBuilder &MIRBuilder) {
7704 auto M1 = MI.getOpcode() == TargetOpcode::G_UITOFP
7705 ? MIRBuilder.buildUITOFP(SrcTy, Src)
7706 : MIRBuilder.buildSITOFP(SrcTy, Src);
7707 LLT S32Ty = SrcTy.changeElementSize(32);
7708 auto M2 = MIRBuilder.buildFPTrunc(S32Ty, M1);
7709 MIRBuilder.buildFPTrunc(Dst, M2);
7710 MI.eraseFromParent();
7711 return LegalizerHelper::Legalized;
7712 }
7713
lowerUITOFP(MachineInstr & MI)7714 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
7715 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7716
7717 if (SrcTy == LLT::scalar(1)) {
7718 auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
7719 auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
7720 MIRBuilder.buildSelect(Dst, Src, True, False);
7721 MI.eraseFromParent();
7722 return Legalized;
7723 }
7724
7725 if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
7726 return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
7727
7728 if (SrcTy != LLT::scalar(64))
7729 return UnableToLegalize;
7730
7731 if (DstTy == LLT::scalar(32))
7732 // TODO: SelectionDAG has several alternative expansions to port which may
7733 // be more reasonable depending on the available instructions. We also need
7734 // a more advanced mechanism to choose an optimal version depending on
7735 // target features such as sitofp or CTLZ availability.
7736 return lowerU64ToF32WithSITOFP(MI);
7737
7738 if (DstTy == LLT::scalar(64))
7739 return lowerU64ToF64BitFloatOps(MI);
7740
7741 return UnableToLegalize;
7742 }
7743
lowerSITOFP(MachineInstr & MI)7744 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
7745 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7746
7747 const LLT S64 = LLT::scalar(64);
7748 const LLT S32 = LLT::scalar(32);
7749 const LLT S1 = LLT::scalar(1);
7750
7751 if (SrcTy == S1) {
7752 auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
7753 auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
7754 MIRBuilder.buildSelect(Dst, Src, True, False);
7755 MI.eraseFromParent();
7756 return Legalized;
7757 }
7758
7759 if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
7760 return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
7761
7762 if (SrcTy != S64)
7763 return UnableToLegalize;
7764
7765 if (DstTy == S32) {
7766 // signed cl2f(long l) {
7767 // long s = l >> 63;
7768 // float r = cul2f((l + s) ^ s);
7769 // return s ? -r : r;
7770 // }
7771 Register L = Src;
7772 auto SignBit = MIRBuilder.buildConstant(S64, 63);
7773 auto S = MIRBuilder.buildAShr(S64, L, SignBit);
7774
7775 auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
7776 auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
7777 auto R = MIRBuilder.buildUITOFP(S32, Xor);
7778
7779 auto RNeg = MIRBuilder.buildFNeg(S32, R);
7780 auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
7781 MIRBuilder.buildConstant(S64, 0));
7782 MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
7783 MI.eraseFromParent();
7784 return Legalized;
7785 }
7786
7787 return UnableToLegalize;
7788 }
7789
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // Only scalar f32/f64 -> i32/i64 expansions are implemented here.
  if (SrcTy != S64 && SrcTy != S32)
    return UnableToLegalize;
  if (DstTy != S32 && DstTy != S64)
    return UnableToLegalize;

  // FPTOSI gives same result as FPTOUI for positive signed integers.
  // FPTOUI needs to deal with fp values that convert to unsigned integers
  // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.

  // 2^Exp as an integer (just the sign-bit mask of DstTy) and as a
  // floating-point constant of the source type.
  APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
  APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
                                                : APFloat::IEEEdouble(),
                    APInt::getZero(SrcTy.getSizeInBits()));
  TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);

  // Conversion used when Src < 2^Exp (fits in the signed range).
  MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);

  MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
  // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
  // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
  MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
  MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
  MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
  MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);

  const LLT S1 = LLT::scalar(1);

  // FCMP_ULT is true for unordered inputs, so NaN also takes the direct
  // FPTOSI path.
  MachineInstrBuilder FCMP =
      MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
  MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);

  MI.eraseFromParent();
  return Legalized;
}
7829
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // FIXME: Only f32 to i64 conversions are supported.
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  // Extract the biased exponent field (bits 23..30 of an IEEE f32).
  auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);

  auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);

  // Smear the sign bit across the whole destination: Sign is 0 or -1.
  auto SignMask = MIRBuilder.buildConstant(SrcTy,
                                           APInt::getSignMask(SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
  Sign = MIRBuilder.buildSExt(DstTy, Sign);

  // Reconstruct the full significand: mantissa bits plus the implicit
  // leading one (0x00800000).
  auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
  auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);

  auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
  R = MIRBuilder.buildZExt(DstTy, R);

  // Unbias the exponent (f32 bias is 127) and compute the shift distances
  // for both directions; only one of Shl/Srl is selected below.
  auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
  auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
  auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);

  auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
  auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);

  const LLT S1 = LLT::scalar(1);
  // Shift left when the exponent exceeds the mantissa width, right otherwise.
  auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
                                    S1, Exponent, ExponentLoBit);

  R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);

  // Apply the sign: (R ^ Sign) - Sign negates R when Sign is all-ones and
  // is the identity when Sign is zero.
  auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
  auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);

  // A negative unbiased exponent means |Src| < 1, which truncates to 0.
  auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
                                          S1, Exponent, ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
  MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);

  MI.eraseFromParent();
  return Legalized;
}
7893
// Lower G_FPTOSI_SAT / G_FPTOUI_SAT: saturating float-to-integer conversion
// where out-of-range inputs clamp to the integer min/max and NaN maps to 0.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI_SAT;
  unsigned SatWidth = DstTy.getScalarSizeInBits();

  // Determine minimum and maximum integer values and their corresponding
  // floating-point values.
  APInt MinInt, MaxInt;
  if (IsSigned) {
    MinInt = APInt::getSignedMinValue(SatWidth);
    MaxInt = APInt::getSignedMaxValue(SatWidth);
  } else {
    MinInt = APInt::getMinValue(SatWidth);
    MaxInt = APInt::getMaxValue(SatWidth);
  }

  const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
  APFloat MinFloat(Semantics);
  APFloat MaxFloat(Semantics);

  // Round toward zero so the float bounds never overshoot the integer
  // range; opInexact tells us whether the bounds are exact.
  APFloat::opStatus MinStatus =
      MinFloat.convertFromAPInt(MinInt, IsSigned, APFloat::rmTowardZero);
  APFloat::opStatus MaxStatus =
      MaxFloat.convertFromAPInt(MaxInt, IsSigned, APFloat::rmTowardZero);
  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
                             !(MaxStatus & APFloat::opStatus::opInexact);

  // If the integer bounds are exactly representable as floats, emit a
  // min+max+fptoi sequence. Otherwise we have to use a sequence of comparisons
  // and selects.
  if (AreExactFloatBounds) {
    // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
    auto MaxC = MIRBuilder.buildFConstant(SrcTy, MinFloat);
    auto MaxP = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT,
                                     SrcTy.changeElementSize(1), Src, MaxC);
    auto Max = MIRBuilder.buildSelect(SrcTy, MaxP, Src, MaxC);
    // Clamp by MaxFloat from above. NaN cannot occur.
    auto MinC = MIRBuilder.buildFConstant(SrcTy, MaxFloat);
    auto MinP =
        MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, SrcTy.changeElementSize(1), Max,
                             MinC, MachineInstr::FmNoNans);
    auto Min =
        MIRBuilder.buildSelect(SrcTy, MinP, Max, MinC, MachineInstr::FmNoNans);
    // Convert clamped value to integer. In the unsigned case we're done,
    // because we mapped NaN to MinFloat, which will cast to zero.
    if (!IsSigned) {
      MIRBuilder.buildFPTOUI(Dst, Min);
      MI.eraseFromParent();
      return Legalized;
    }

    // Otherwise, select 0 if Src is NaN.
    auto FpToInt = MIRBuilder.buildFPTOSI(DstTy, Min);
    // FCMP_UNO(Src, Src) is true exactly when Src is NaN.
    auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_UNO,
                                       DstTy.changeElementSize(1), Src, Src);
    MIRBuilder.buildSelect(Dst, IsZero, MIRBuilder.buildConstant(DstTy, 0),
                           FpToInt);
    MI.eraseFromParent();
    return Legalized;
  }

  // Result of direct conversion. The assumption here is that the operation is
  // non-trapping and it's fine to apply it to an out-of-range value if we
  // select it away later.
  auto FpToInt = IsSigned ? MIRBuilder.buildFPTOSI(DstTy, Src)
                          : MIRBuilder.buildFPTOUI(DstTy, Src);

  // If Src ULT MinFloat, select MinInt. In particular, this also selects
  // MinInt if Src is NaN.
  auto ULT =
      MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, SrcTy.changeElementSize(1), Src,
                           MIRBuilder.buildFConstant(SrcTy, MinFloat));
  auto Max = MIRBuilder.buildSelect(
      DstTy, ULT, MIRBuilder.buildConstant(DstTy, MinInt), FpToInt);
  // If Src OGT MaxFloat, select MaxInt.
  auto OGT =
      MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, SrcTy.changeElementSize(1), Src,
                           MIRBuilder.buildFConstant(SrcTy, MaxFloat));

  // In the unsigned case we are done, because we mapped NaN to MinInt, which
  // is already zero.
  if (!IsSigned) {
    MIRBuilder.buildSelect(Dst, OGT, MIRBuilder.buildConstant(DstTy, MaxInt),
                           Max);
    MI.eraseFromParent();
    return Legalized;
  }

  // Otherwise, select 0 if Src is NaN.
  auto Min = MIRBuilder.buildSelect(
      DstTy, OGT, MIRBuilder.buildConstant(DstTy, MaxInt), Max);
  auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_UNO,
                                     DstTy.changeElementSize(1), Src, Src);
  MIRBuilder.buildSelect(Dst, IsZero, MIRBuilder.buildConstant(DstTy, 0), Min);
  MI.eraseFromParent();
  return Legalized;
}
7993
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  auto [Dst, Src] = MI.getFirst2Regs();
  assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
         MRI.getType(Src).getScalarType() == LLT::scalar(64));

  if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  // With unsafe FP math, double rounding through an f32 intermediate is
  // acceptable.
  if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
    unsigned Flags = MI.getFlags();
    auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
    MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
    MI.eraseFromParent();
    return Legalized;
  }

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;

  // Split the f64 bit pattern into its low (U) and high (UH) 32-bit halves.
  auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
  Register U = Unmerge.getReg(0);
  Register UH = Unmerge.getReg(1);

  // E = biased f64 exponent (bits 20..30 of the high half).
  auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
  E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
      S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));

  // M = the top mantissa bits, positioned with two extra low bits for
  // rounding.
  auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
  M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));

  // Collapse all the mantissa bits that were shifted out into one sticky
  // bit (nonzero iff any of the low 41 discarded bits is set).
  auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
                                       MIRBuilder.buildConstant(S32, 0x1ff));
  MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);

  auto Zero = MIRBuilder.buildConstant(S32, 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
  auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
  M = MIRBuilder.buildOr(S32, M, Lo40Set);

  // I = the f16 Inf/NaN pattern, preserving NaN-ness of the source:
  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
  auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
  auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);

  // N = the normal-number candidate: M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
  auto N = MIRBuilder.buildOr(S32, M, EShl12);

  // Denormal candidate. B = clamp(1-E, 0, 13);
  auto One = MIRBuilder.buildConstant(S32, 1);
  auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
  auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
  B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));

  // Add the implicit leading one before shifting right by B.
  auto SigSetHigh = MIRBuilder.buildOr(S32, M,
                                       MIRBuilder.buildConstant(S32, 0x1000));

  auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
  auto D0 = MIRBuilder.buildShl(S32, D, B);

  // If shifting discarded set bits, fold them into a sticky bit so
  // rounding still sees them.
  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                               D0, SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(S32, D, D1);

  // Select the denormal candidate when E < 1, the normal one otherwise.
  auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
  auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);

  // Round to nearest, ties to even, using the two extra low bits: add 1
  // when the low bits are 0b11 (tie with odd LSB) or > 0b101 (above half).
  auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
  V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 3));
  auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 5));
  auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);

  V1 = MIRBuilder.buildOr(S32, V0, V1);
  V = MIRBuilder.buildAdd(S32, V, V1);

  // Exponents that overflow f16 (E > 30) become infinity (0x7c00).
  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
  V = MIRBuilder.buildSelect(S32, CmpEGt30,
                             MIRBuilder.buildConstant(S32, 0x7c00), V);

  // E == 1039 (= 2047 - 1023 + 15, an all-ones f64 exponent after
  // rebiasing) means the source was Inf/NaN: use the pattern I instead.
  auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
  Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));

  // Insert the sign bit
  V = MIRBuilder.buildOr(S32, Sign, V);

  MIRBuilder.buildTrunc(Dst, V);
  MI.eraseFromParent();
  return Legalized;
}
8109
8110 LegalizerHelper::LegalizeResult
lowerFPTRUNC(MachineInstr & MI)8111 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
8112 auto [DstTy, SrcTy] = MI.getFirst2LLTs();
8113 const LLT S64 = LLT::scalar(64);
8114 const LLT S16 = LLT::scalar(16);
8115
8116 if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
8117 return lowerFPTRUNC_F64_TO_F16(MI);
8118
8119 return UnableToLegalize;
8120 }
8121
lowerFPOWI(MachineInstr & MI)8122 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
8123 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8124 LLT Ty = MRI.getType(Dst);
8125
8126 auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
8127 MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
8128 MI.eraseFromParent();
8129 return Legalized;
8130 }
8131
minMaxToCompare(unsigned Opc)8132 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
8133 switch (Opc) {
8134 case TargetOpcode::G_SMIN:
8135 return CmpInst::ICMP_SLT;
8136 case TargetOpcode::G_SMAX:
8137 return CmpInst::ICMP_SGT;
8138 case TargetOpcode::G_UMIN:
8139 return CmpInst::ICMP_ULT;
8140 case TargetOpcode::G_UMAX:
8141 return CmpInst::ICMP_UGT;
8142 default:
8143 llvm_unreachable("not in integer min/max");
8144 }
8145 }
8146
lowerMinMax(MachineInstr & MI)8147 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
8148 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8149
8150 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
8151 LLT CmpType = MRI.getType(Dst).changeElementSize(1);
8152
8153 auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
8154 MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
8155
8156 MI.eraseFromParent();
8157 return Legalized;
8158 }
8159
// Lower a three-way compare (G_SCMP/G_UCMP) to -1 / 0 / +1 for
// LHS < RHS / LHS == RHS / LHS > RHS respectively.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
  GSUCmp *Cmp = cast<GSUCmp>(&MI);

  Register Dst = Cmp->getReg(0);
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Cmp->getReg(1));
  LLT CmpTy = DstTy.changeElementSize(1);

  // Choose signed or unsigned predicates depending on the opcode.
  CmpInst::Predicate LTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SLT
                                       : CmpInst::Predicate::ICMP_ULT;
  CmpInst::Predicate GTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SGT
                                       : CmpInst::Predicate::ICMP_UGT;

  auto Zero = MIRBuilder.buildConstant(DstTy, 0);
  auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, Cmp->getLHSReg(),
                                   Cmp->getRHSReg());
  auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, Cmp->getLHSReg(),
                                   Cmp->getRHSReg());

  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
  auto BC = TLI.getBooleanContents(DstTy.isVector(), /*isFP=*/false);
  if (TLI.shouldExpandCmpUsingSelects(getApproximateEVTForLLT(SrcTy, Ctx)) ||
      BC == TargetLowering::UndefinedBooleanContent) {
    // Select-based form: IsLT ? -1 : (IsGT ? 1 : 0).
    auto One = MIRBuilder.buildConstant(DstTy, 1);
    auto SelectZeroOrOne = MIRBuilder.buildSelect(DstTy, IsGT, One, Zero);

    auto MinusOne = MIRBuilder.buildConstant(DstTy, -1);
    MIRBuilder.buildSelect(Dst, IsLT, MinusOne, SelectZeroOrOne);
  } else {
    // Arithmetic form: ext(IsGT) - ext(IsLT). When the target uses
    // zero-or-negative-one booleans the extensions yield 0/-1, so the
    // operands are swapped to keep the result in {-1, 0, +1}.
    if (BC == TargetLowering::ZeroOrNegativeOneBooleanContent)
      std::swap(IsGT, IsLT);
    // Extend boolean results to DstTy, which is at least i2, before
    // subtracting them.
    unsigned BoolExtOp =
        MIRBuilder.getBoolExtOp(DstTy.isVector(), /*isFP=*/false);
    IsGT = MIRBuilder.buildInstr(BoolExtOp, {DstTy}, {IsGT});
    IsLT = MIRBuilder.buildInstr(BoolExtOp, {DstTy}, {IsLT});
    MIRBuilder.buildSub(Dst, IsGT, IsLT);
  }

  MI.eraseFromParent();
  return Legalized;
}
8206
// Lower G_FCOPYSIGN with integer bit operations: combine the magnitude bits
// of Src0 with the sign bit of Src1, handling mismatched operand widths.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
  auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
  const int Src0Size = Src0Ty.getScalarSizeInBits();
  const int Src1Size = Src1Ty.getScalarSizeInBits();

  auto SignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getSignMask(Src0Size));

  auto NotSignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));

  // And0 = Src0 with its sign bit cleared; And1 = just the sign bit of
  // Src1, moved into Src0's sign position when the widths differ.
  Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
  Register And1;
  if (Src0Ty == Src1Ty) {
    And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
  } else if (Src0Size > Src1Size) {
    // Src1 is narrower: widen it, then shift its sign bit up into position.
    auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
    auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
    auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
    And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
  } else {
    // Src1 is wider: shift its sign bit down, then truncate.
    auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
    auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
    auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
    And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
  }

  // Be careful about setting nsz/nnan/ninf on every instruction, since the
  // constants are a nan and -0.0, but the final result should preserve
  // everything.
  unsigned Flags = MI.getFlags();

  // We masked the sign bit and the not-sign bit, so these are disjoint.
  Flags |= MachineInstr::Disjoint;

  MIRBuilder.buildOr(Dst, And0, And1, Flags);

  MI.eraseFromParent();
  return Legalized;
}
8248
// Lower fminnum/fmaxnum (and fminimumnum/fmaximumnum) by quieting possible
// sNaN inputs with G_FCANONICALIZE and emitting a simpler min/max opcode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
  // FIXME: fminnum/fmaxnum and fminimumnum/fmaximumnum should not have
  // identical handling. fminimumnum/fmaximumnum also need a path that do not
  // depend on fminnum/fmaxnum.

  // Map each source opcode onto the opcode it is lowered to.
  unsigned NewOp;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_FMINNUM:
    NewOp = TargetOpcode::G_FMINNUM_IEEE;
    break;
  case TargetOpcode::G_FMINIMUMNUM:
    NewOp = TargetOpcode::G_FMINNUM;
    break;
  case TargetOpcode::G_FMAXNUM:
    NewOp = TargetOpcode::G_FMAXNUM_IEEE;
    break;
  case TargetOpcode::G_FMAXIMUMNUM:
    NewOp = TargetOpcode::G_FMAXNUM;
    break;
  default:
    llvm_unreachable("unexpected min/max opcode");
  }

  auto [Dst, Src0, Src1] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Dst);

  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // Insert canonicalizes if it's possible we need to quiet to get correct
    // sNaN behavior.

    // Note this must be done here, and not as an optimization combine in the
    // absence of a dedicate quiet-snan instruction as we're using an
    // omni-purpose G_FCANONICALIZE.
    if (!isKnownNeverSNaN(Src0, MRI))
      Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);

    if (!isKnownNeverSNaN(Src1, MRI))
      Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
  }

  // If there are no nans, it's safe to simply replace this with the non-IEEE
  // version.
  MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}
8296
lowerFMad(MachineInstr & MI)8297 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
8298 // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
8299 Register DstReg = MI.getOperand(0).getReg();
8300 LLT Ty = MRI.getType(DstReg);
8301 unsigned Flags = MI.getFlags();
8302
8303 auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
8304 Flags);
8305 MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
8306 MI.eraseFromParent();
8307 return Legalized;
8308 }
8309
8310 LegalizerHelper::LegalizeResult
lowerIntrinsicRound(MachineInstr & MI)8311 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
8312 auto [DstReg, X] = MI.getFirst2Regs();
8313 const unsigned Flags = MI.getFlags();
8314 const LLT Ty = MRI.getType(DstReg);
8315 const LLT CondTy = Ty.changeElementSize(1);
8316
8317 // round(x) =>
8318 // t = trunc(x);
8319 // d = fabs(x - t);
8320 // o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
8321 // return t + o;
8322
8323 auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
8324
8325 auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
8326 auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
8327
8328 auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
8329 auto Cmp =
8330 MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags);
8331
8332 // Could emit G_UITOFP instead
8333 auto One = MIRBuilder.buildFConstant(Ty, 1.0);
8334 auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
8335 auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero);
8336 auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X);
8337
8338 MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags);
8339
8340 MI.eraseFromParent();
8341 return Legalized;
8342 }
8343
lowerFFloor(MachineInstr & MI)8344 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
8345 auto [DstReg, SrcReg] = MI.getFirst2Regs();
8346 unsigned Flags = MI.getFlags();
8347 LLT Ty = MRI.getType(DstReg);
8348 const LLT CondTy = Ty.changeElementSize(1);
8349
8350 // result = trunc(src);
8351 // if (src < 0.0 && src != result)
8352 // result += -1.0.
8353
8354 auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
8355 auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
8356
8357 auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
8358 SrcReg, Zero, Flags);
8359 auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
8360 SrcReg, Trunc, Flags);
8361 auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
8362 auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
8363
8364 MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
8365 MI.eraseFromParent();
8366 return Legalized;
8367 }
8368
// Lower G_MERGE_VALUES by zero-extending each source part into a scalar of
// the full destination width, shifting it to its bit offset, and ORing the
// parts together.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  const unsigned NumOps = MI.getNumOperands();
  auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
  unsigned PartSize = Src0Ty.getSizeInBits();

  // Part 0 (operand 1) needs no shift; it seeds the accumulator.
  LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
  Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);

  // Remaining parts are operands 2..NumOps-1, each PartSize bits higher.
  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

    // The final OR can define DstReg directly, unless the destination is a
    // pointer and still needs the G_INTTOPTR below.
    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
    auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
    MIRBuilder.buildOr(NextResult, ResultReg, Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    // Integer-to-pointer casts are not meaningful in non-integral address
    // spaces, so bail out there.
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(DstReg, ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}
8406
/// Lower G_UNMERGE_VALUES by coercing the source to a scalar integer and
/// producing each destination with a logical right shift and a truncate.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
  // All operands but the last are destinations; the last is the source.
  const unsigned NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (DstTy.isPointer())
    return UnableToLegalize; // TODO

  // Coerce the source to a scalar integer of the same total width; bail out
  // if that is not possible (e.g. non-integral pointer source).
  SrcReg = coerceToScalar(SrcReg);
  if (!SrcReg)
    return UnableToLegalize;

  // Expand scalarizing unmerge as bitcast to integer and shift.
  LLT IntTy = MRI.getType(SrcReg);

  // Destination 0 holds the low bits: a plain truncate suffices.
  MIRBuilder.buildTrunc(Dst0Reg, SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  unsigned Offset = DstSize;
  for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
    auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
    MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
  }

  MI.eraseFromParent();
  return Legalized;
}
8436
8437 /// Lower a vector extract or insert by writing the vector to a stack temporary
8438 /// and reloading the element or vector.
8439 ///
8440 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
8441 /// =>
8442 /// %stack_temp = G_FRAME_INDEX
8443 /// G_STORE %vec, %stack_temp
8444 /// %idx = clamp(%idx, %vec.getNumElements())
8445 /// %element_ptr = G_PTR_ADD %stack_temp, %idx
8446 /// %dst = G_LOAD %element_ptr
8447 LegalizerHelper::LegalizeResult
lowerExtractInsertVectorElt(MachineInstr & MI)8448 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
8449 Register DstReg = MI.getOperand(0).getReg();
8450 Register SrcVec = MI.getOperand(1).getReg();
8451 Register InsertVal;
8452 if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
8453 InsertVal = MI.getOperand(2).getReg();
8454
8455 Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
8456
8457 LLT VecTy = MRI.getType(SrcVec);
8458 LLT EltTy = VecTy.getElementType();
8459 unsigned NumElts = VecTy.getNumElements();
8460
8461 int64_t IdxVal;
8462 if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
8463 SmallVector<Register, 8> SrcRegs;
8464 extractParts(SrcVec, EltTy, NumElts, SrcRegs, MIRBuilder, MRI);
8465
8466 if (InsertVal) {
8467 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
8468 MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs);
8469 } else {
8470 MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
8471 }
8472
8473 MI.eraseFromParent();
8474 return Legalized;
8475 }
8476
8477 if (!EltTy.isByteSized()) { // Not implemented.
8478 LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
8479 return UnableToLegalize;
8480 }
8481
8482 unsigned EltBytes = EltTy.getSizeInBytes();
8483 Align VecAlign = getStackTemporaryAlignment(VecTy);
8484 Align EltAlign;
8485
8486 MachinePointerInfo PtrInfo;
8487 auto StackTemp = createStackTemporary(
8488 TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, PtrInfo);
8489 MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
8490
8491 // Get the pointer to the element, and be sure not to hit undefined behavior
8492 // if the index is out of bounds.
8493 Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
8494
8495 if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
8496 int64_t Offset = IdxVal * EltBytes;
8497 PtrInfo = PtrInfo.getWithOffset(Offset);
8498 EltAlign = commonAlignment(VecAlign, Offset);
8499 } else {
8500 // We lose information with a variable offset.
8501 EltAlign = getStackTemporaryAlignment(EltTy);
8502 PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
8503 }
8504
8505 if (InsertVal) {
8506 // Write the inserted element
8507 MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
8508
8509 // Reload the whole vector.
8510 MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
8511 } else {
8512 MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
8513 }
8514
8515 MI.eraseFromParent();
8516 return Legalized;
8517 }
8518
/// Lower G_SHUFFLE_VECTOR by extracting each selected source element and
/// rebuilding the result with G_BUILD_VECTOR (or a plain copy for a scalar
/// destination).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
      MI.getFirst3RegLLTs();
  LLT IdxTy = LLT::scalar(32);

  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getScalarType();

  for (int Idx : Mask) {
    if (Idx < 0) {
      // Negative mask entries mean "don't care"; reuse a single undef value.
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
      BuildVec.push_back(Undef);
      continue;
    }

    if (Src0Ty.isScalar()) {
      // Scalar sources: index 0 selects the first source, anything else the
      // second.
      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
    } else {
      // Mask indices are defined over the concatenation of both sources, so
      // map the index back to the owning vector.
      int NumElts = Src0Ty.getNumElements();
      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
      BuildVec.push_back(Extract.getReg(0));
    }
  }

  if (DstTy.isVector())
    MIRBuilder.buildBuildVector(DstReg, BuildVec);
  else
    MIRBuilder.buildCopy(DstReg, BuildVec[0]);
  MI.eraseFromParent();
  return Legalized;
}
8557
/// Lower G_VECTOR_COMPRESS through a stack temporary: walk every lane,
/// store the lane's value at the current output slot, and advance the slot
/// only when the lane's mask bit is set; the packed vector is then reloaded
/// from the stack. With a passthru operand, the temporary is pre-filled
/// with the passthru and the slot after the packed data is patched so
/// trailing unselected lanes read the correct value.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
  auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] =
      MI.getFirst4RegLLTs();

  if (VecTy.isScalableVector())
    report_fatal_error("Cannot expand masked_compress for scalable vectors.");

  // Stack slot big enough for the whole vector.
  Align VecAlign = getStackTemporaryAlignment(VecTy);
  MachinePointerInfo PtrInfo;
  Register StackPtr =
      createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign,
                           PtrInfo)
          .getReg(0);
  MachinePointerInfo ValPtrInfo =
      MachinePointerInfo::getUnknownStack(*MI.getMF());

  LLT IdxTy = LLT::scalar(32);
  LLT ValTy = VecTy.getElementType();
  Align ValAlign = getStackTemporaryAlignment(ValTy);

  // Running index of the next element slot to write.
  auto OutPos = MIRBuilder.buildConstant(IdxTy, 0);

  bool HasPassthru =
      MRI.getVRegDef(Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF;

  // Pre-fill the slot with the passthru so unwritten tail lanes keep it.
  if (HasPassthru)
    MIRBuilder.buildStore(Passthru, StackPtr, PtrInfo, VecAlign);

  Register LastWriteVal;
  std::optional<APInt> PassthruSplatVal =
      isConstantOrConstantSplatVector(*MRI.getVRegDef(Passthru), MRI);

  if (PassthruSplatVal.has_value()) {
    // Splat passthru: the tail fix-up value is simply the splat constant.
    LastWriteVal =
        MIRBuilder.buildConstant(ValTy, PassthruSplatVal.value()).getReg(0);
  } else if (HasPassthru) {
    // General passthru: count the selected lanes (reduce-add of the
    // zero-extended mask) and load the passthru element that sits right
    // after the packed data.
    auto Popcount = MIRBuilder.buildZExt(MaskTy.changeElementSize(32), Mask);
    Popcount = MIRBuilder.buildInstr(TargetOpcode::G_VECREDUCE_ADD,
                                     {LLT::scalar(32)}, {Popcount});

    Register LastElmtPtr =
        getVectorElementPointer(StackPtr, VecTy, Popcount.getReg(0));
    LastWriteVal =
        MIRBuilder.buildLoad(ValTy, LastElmtPtr, ValPtrInfo, ValAlign)
            .getReg(0);
  }

  unsigned NumElmts = VecTy.getNumElements();
  for (unsigned I = 0; I < NumElmts; ++I) {
    auto Idx = MIRBuilder.buildConstant(IdxTy, I);
    auto Val = MIRBuilder.buildExtractVectorElement(ValTy, Vec, Idx);
    Register ElmtPtr =
        getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0));
    // Store unconditionally; an unselected lane is overwritten by the next
    // selected one because OutPos only advances on set mask bits.
    MIRBuilder.buildStore(Val, ElmtPtr, ValPtrInfo, ValAlign);

    // Normalize the mask lane to a 0/1 increment for OutPos.
    LLT MaskITy = MaskTy.getElementType();
    auto MaskI = MIRBuilder.buildExtractVectorElement(MaskITy, Mask, Idx);
    if (MaskITy.getSizeInBits() > 1)
      MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI);

    MaskI = MIRBuilder.buildZExt(IdxTy, MaskI);
    OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI);

    if (HasPassthru && I == NumElmts - 1) {
      // After the last lane: clamp OutPos into bounds and store the tail
      // fix-up value there (the last lane's value if every lane was
      // selected, otherwise the precomputed passthru element).
      auto EndOfVector =
          MIRBuilder.buildConstant(IdxTy, VecTy.getNumElements() - 1);
      auto AllLanesSelected = MIRBuilder.buildICmp(
          CmpInst::ICMP_UGT, LLT::scalar(1), OutPos, EndOfVector);
      OutPos = MIRBuilder.buildInstr(TargetOpcode::G_UMIN, {IdxTy},
                                     {OutPos, EndOfVector});
      ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0));

      LastWriteVal =
          MIRBuilder.buildSelect(ValTy, AllLanesSelected, Val, LastWriteVal)
              .getReg(0);
      MIRBuilder.buildStore(LastWriteVal, ElmtPtr, ValPtrInfo, ValAlign);
    }
  }

  // TODO: Use StackPtr's FrameIndex alignment.
  MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, VecAlign);

  MI.eraseFromParent();
  return Legalized;
}
8644
getDynStackAllocTargetPtr(Register SPReg,Register AllocSize,Align Alignment,LLT PtrTy)8645 Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
8646 Register AllocSize,
8647 Align Alignment,
8648 LLT PtrTy) {
8649 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
8650
8651 auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
8652 SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
8653
8654 // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
8655 // have to generate an extra instruction to negate the alloc and then use
8656 // G_PTR_ADD to add the negative offset.
8657 auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
8658 if (Alignment > Align(1)) {
8659 APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
8660 AlignMask.negate();
8661 auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
8662 Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
8663 }
8664
8665 return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
8666 }
8667
8668 LegalizerHelper::LegalizeResult
lowerDynStackAlloc(MachineInstr & MI)8669 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
8670 const auto &MF = *MI.getMF();
8671 const auto &TFI = *MF.getSubtarget().getFrameLowering();
8672 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
8673 return UnableToLegalize;
8674
8675 Register Dst = MI.getOperand(0).getReg();
8676 Register AllocSize = MI.getOperand(1).getReg();
8677 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
8678
8679 LLT PtrTy = MRI.getType(Dst);
8680 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
8681 Register SPTmp =
8682 getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
8683
8684 MIRBuilder.buildCopy(SPReg, SPTmp);
8685 MIRBuilder.buildCopy(Dst, SPTmp);
8686
8687 MI.eraseFromParent();
8688 return Legalized;
8689 }
8690
8691 LegalizerHelper::LegalizeResult
lowerStackSave(MachineInstr & MI)8692 LegalizerHelper::lowerStackSave(MachineInstr &MI) {
8693 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
8694 if (!StackPtr)
8695 return UnableToLegalize;
8696
8697 MIRBuilder.buildCopy(MI.getOperand(0), StackPtr);
8698 MI.eraseFromParent();
8699 return Legalized;
8700 }
8701
8702 LegalizerHelper::LegalizeResult
lowerStackRestore(MachineInstr & MI)8703 LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
8704 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
8705 if (!StackPtr)
8706 return UnableToLegalize;
8707
8708 MIRBuilder.buildCopy(StackPtr, MI.getOperand(0));
8709 MI.eraseFromParent();
8710 return Legalized;
8711 }
8712
/// Lower G_EXTRACT. Element-aligned extracts from a vector are expanded as
/// unmerge + re-merge of the covered elements; scalar extracts fall back to
/// a right shift and truncate of the (bitcast) source integer.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtract(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(2).getImm();

  // Extract sub-vector or one element
  if (SrcTy.isVector()) {
    unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
    unsigned DstSize = DstTy.getSizeInBits();

    // Only handled when the extract covers whole elements and stays in
    // bounds.
    if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
        (Offset + DstSize <= SrcTy.getSizeInBits())) {
      // Unmerge and allow access to each Src element for the artifact combiner.
      auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg);

      // Take element(s) we need to extract and copy it (merge them).
      SmallVector<Register, 8> SubVectorElts;
      for (unsigned Idx = Offset / SrcEltSize;
           Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
        SubVectorElts.push_back(Unmerge.getReg(Idx));
      }
      if (SubVectorElts.size() == 1)
        MIRBuilder.buildCopy(DstReg, SubVectorElts[0]);
      else
        MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts);

      MI.eraseFromParent();
      return Legalized;
    }
  }

  // Scalar destination: treat the source as one wide integer and shift the
  // requested bit field down to position zero.
  if (DstTy.isScalar() &&
      (SrcTy.isScalar() ||
       (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
    LLT SrcIntTy = SrcTy;
    if (!SrcTy.isScalar()) {
      SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
    }

    if (Offset == 0)
      MIRBuilder.buildTrunc(DstReg, SrcReg);
    else {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
      auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(DstReg, Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
8767
/// Lower G_INSERT. Element-aligned inserts into a vector are expanded as an
/// unmerge with the covered elements replaced and a re-merge. Otherwise the
/// destination is treated as one wide integer: the target bit range is
/// masked out and the (zero-extended, shifted) insert source is OR-ed in.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
  uint64_t Offset = MI.getOperand(3).getImm();

  LLT DstTy = MRI.getType(Src);
  LLT InsertTy = MRI.getType(InsertSrc);

  // Insert sub-vector or one element
  if (DstTy.isVector() && !InsertTy.isPointer()) {
    LLT EltTy = DstTy.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned InsertSize = InsertTy.getSizeInBits();

    // Only when the insert covers whole elements and stays in bounds.
    if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
        (Offset + InsertSize <= DstTy.getSizeInBits())) {
      auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
      SmallVector<Register, 8> DstElts;
      unsigned Idx = 0;
      // Elements from Src before insert start Offset
      for (; Idx < Offset / EltSize; ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      // Replace elements in Src with elements from InsertSrc
      if (InsertTy.getSizeInBits() > EltSize) {
        auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
        for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
             ++Idx, ++i) {
          DstElts.push_back(UnmergeInsertSrc.getReg(i));
        }
      } else {
        // Single-element insert: use the value itself, no unmerge needed.
        DstElts.push_back(InsertSrc);
        ++Idx;
      }

      // Remaining elements from Src after insert
      for (; Idx < DstTy.getNumElements(); ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      MIRBuilder.buildMergeLikeInstr(Dst, DstElts);
      MI.eraseFromParent();
      return Legalized;
    }
  }

  // The bitwise fallback below only handles scalar-like inserts.
  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  // Pointers in non-integral address spaces cannot be cast to integers.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
      (InsertTy.isPointer() &&
       DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  LLT IntDstTy = DstTy;

  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
  }

  // Position the zero-extended insert value at its bit offset.
  Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
    ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
  }

  // Mask with zeros over the inserted bit range and ones everywhere else
  // (the wrapped set covers [Offset + size, Offset)).
  APInt MaskVal = APInt::getBitsSetWithWrap(
      DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);

  auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
  auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);

  MIRBuilder.buildCast(Dst, Or);
  MI.eraseFromParent();
  return Legalized;
}
8856
/// Lower G_SADDO/G_SSUBO by performing the plain add/sub and deriving the
/// overflow flag from sign comparisons of the result and the operands.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
  auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
      MI.getFirst4RegLLTs();
  const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;

  LLT Ty = Dst0Ty;
  LLT BoolTy = Dst1Ty;

  // Compute into a clone of Dst0 so the compares below can use the result;
  // Dst0 itself is only defined by the final copy.
  Register NewDst0 = MRI.cloneVirtualRegister(Dst0);

  if (IsAdd)
    MIRBuilder.buildAdd(NewDst0, LHS, RHS);
  else
    MIRBuilder.buildSub(NewDst0, LHS, RHS);

  // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.

  auto Zero = MIRBuilder.buildConstant(Ty, 0);

  // For an addition, the result should be less than one of the operands (LHS)
  // if and only if the other operand (RHS) is negative, otherwise there will
  // be overflow.
  // For a subtraction, the result should be less than one of the operands
  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
  // otherwise there will be overflow.
  auto ResultLowerThanLHS =
      MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS);
  auto ConditionRHS = MIRBuilder.buildICmp(
      IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);

  // Overflow iff exactly one of the two conditions holds.
  MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);

  MIRBuilder.buildCopy(Dst0, NewDst0);
  MI.eraseFromParent();

  return Legalized;
}
8895
/// Lower G_[SU]{ADD,SUB}SAT in terms of min/max: clamp the RHS into the
/// range that cannot overflow for the given LHS, then perform the plain
/// add/sub.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Ty, 0);
      Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
      Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Ty, -1);
      Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
                               MaxVal);
      Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
                               MinVal);
    }
    // Clamp RHS into [lo, hi] so the base op cannot overflow.
    auto RHSClamped =
        MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
    auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}
8970
/// Lower G_[SU]{ADD,SUB}SAT using the corresponding overflow-reporting
/// add/sub, selecting the saturation value whenever overflow is flagged.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
  Register Tmp = OverflowRes.getReg(0);
  Register Ov = OverflowRes.getReg(1);
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // On overflow the wrapped result's sign bit picks the right bound:
    // sign + INT_MIN yields INT_MAX for a negative tmp and INT_MIN otherwise.
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);

  MI.eraseFromParent();
  return Legalized;
}
9036
/// Lower G_SSHLSAT/G_USHLSAT: perform the shift, shift the result back, and
/// if that round trip does not reproduce the operand the shift overflowed,
/// so substitute the saturation constant instead.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShlSat(MachineInstr &MI) {
  assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
          MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
         "Expected shlsat opcode!");
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);

  unsigned BW = Ty.getScalarSizeInBits();
  auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
  // Undo the shift with the matching (arithmetic/logical) right shift.
  auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
                       : MIRBuilder.buildLShr(Ty, Result, RHS);

  MachineInstrBuilder SatVal;
  if (IsSigned) {
    // Signed saturation: negative LHS clamps to the minimum, non-negative
    // LHS to the maximum.
    auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
    auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
    auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
                                    MIRBuilder.buildConstant(Ty, 0));
    SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
  } else {
    // Unsigned saturation always clamps to the all-ones maximum.
    SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
  }
  auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
  MIRBuilder.buildSelect(Res, Ov, SatVal, Result);

  MI.eraseFromParent();
  return Legalized;
}
9068
lowerBswap(MachineInstr & MI)9069 LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
9070 auto [Dst, Src] = MI.getFirst2Regs();
9071 const LLT Ty = MRI.getType(Src);
9072 unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
9073 unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
9074
9075 // Swap most and least significant byte, set remaining bytes in Res to zero.
9076 auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
9077 auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
9078 auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
9079 auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
9080
9081 // Set i-th high/low byte in Res to i-th low/high byte from Src.
9082 for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
9083 // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
9084 APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
9085 auto Mask = MIRBuilder.buildConstant(Ty, APMask);
9086 auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
9087 // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
9088 auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
9089 auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
9090 Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
9091 // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
9092 auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
9093 auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
9094 Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
9095 }
9096 Res.getInstr()->getOperand(0).setReg(Dst);
9097
9098 MI.eraseFromParent();
9099 return Legalized;
9100 }
9101
9102 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
SwapN(unsigned N,DstOp Dst,MachineIRBuilder & B,MachineInstrBuilder Src,const APInt & Mask)9103 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
9104 MachineInstrBuilder Src, const APInt &Mask) {
9105 const LLT Ty = Dst.getLLTTy(*B.getMRI());
9106 MachineInstrBuilder C_N = B.buildConstant(Ty, N);
9107 MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
9108 auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
9109 auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
9110 return B.buildOr(Dst, LHS, RHS);
9111 }
9112
/// Lower G_BITREVERSE. For types of at least 8 bits, byte-swap first and
/// then swap progressively smaller bit groups (4, 2, 1) within each byte;
/// when an i8-vector G_BITREVERSE of the same total size is legal, bitcast
/// to that type instead. Types narrower than 8 bits are expanded bit by bit.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT SrcTy = MRI.getType(Src);
  unsigned Size = SrcTy.getScalarSizeInBits();
  unsigned VSize = SrcTy.getSizeInBits();

  if (Size >= 8) {
    if (SrcTy.isVector() && (VSize % 8 == 0) &&
        (LI.isLegal({TargetOpcode::G_BITREVERSE,
                     {LLT::fixed_vector(VSize / 8, 8),
                      LLT::fixed_vector(VSize / 8, 8)}}))) {
      // If bitreverse is legal for i8 vector of the same size, then cast
      // to i8 vector type.
      // e.g. v4s32 -> v16s8
      LLT VTy = LLT::fixed_vector(VSize / 8, 8);
      auto BSWAP = MIRBuilder.buildBSwap(SrcTy, Src);
      auto Cast = MIRBuilder.buildBitcast(VTy, BSWAP);
      auto RBIT = MIRBuilder.buildBitReverse(VTy, Cast);
      MIRBuilder.buildBitcast(Dst, RBIT);
    } else {
      // Byte swap puts every byte in its mirrored position; the SwapN calls
      // below then reverse the bits inside each byte.
      MachineInstrBuilder BSWAP =
          MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {SrcTy}, {Src});

      // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
      //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
      // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
      MachineInstrBuilder Swap4 = SwapN(4, SrcTy, MIRBuilder, BSWAP,
                                        APInt::getSplat(Size, APInt(8, 0xF0)));

      // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
      //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
      // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
      MachineInstrBuilder Swap2 = SwapN(2, SrcTy, MIRBuilder, Swap4,
                                        APInt::getSplat(Size, APInt(8, 0xCC)));

      // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
      // 6|7
      //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
      // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
      SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
    }
  } else {
    // Expand bitreverse for types smaller than 8 bits.
    MachineInstrBuilder Tmp;
    for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
      MachineInstrBuilder Tmp2;
      // Move source bit I to its mirrored position J: shift left while the
      // target is higher, right once it is lower.
      if (I < J) {
        auto ShAmt = MIRBuilder.buildConstant(SrcTy, J - I);
        Tmp2 = MIRBuilder.buildShl(SrcTy, Src, ShAmt);
      } else {
        auto ShAmt = MIRBuilder.buildConstant(SrcTy, I - J);
        Tmp2 = MIRBuilder.buildLShr(SrcTy, Src, ShAmt);
      }

      // Isolate just bit J of the shifted value and accumulate.
      auto Mask = MIRBuilder.buildConstant(SrcTy, 1ULL << J);
      Tmp2 = MIRBuilder.buildAnd(SrcTy, Tmp2, Mask);
      if (I == 0)
        Tmp = Tmp2;
      else
        Tmp = MIRBuilder.buildOr(SrcTy, Tmp, Tmp2);
    }
    MIRBuilder.buildCopy(Dst, Tmp);
  }

  MI.eraseFromParent();
  return Legalized;
}
9181
/// Lower G_READ_REGISTER / G_WRITE_REGISTER into a copy from/to the named
/// physical register. If the target does not recognize the register name,
/// emit a diagnostic and (for reads) produce an undef value instead of
/// failing to legalize.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
  MachineFunction &MF = MIRBuilder.getMF();

  bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
  // Operand layout differs: reads are (dst, name-md); writes (name-md, val).
  int NameOpIdx = IsRead ? 1 : 0;
  int ValRegIndex = IsRead ? 0 : 1;

  Register ValReg = MI.getOperand(ValRegIndex).getReg();
  const LLT Ty = MRI.getType(ValReg);
  // The register name is carried as the first operand of an MDNode.
  const MDString *RegStr = cast<MDString>(
    cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));

  Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
  if (!PhysReg) {
    const Function &Fn = MF.getFunction();
    Fn.getContext().diagnose(DiagnosticInfoGenericWithLoc(
        "invalid register \"" + Twine(RegStr->getString().data()) + "\" for " +
            (IsRead ? "llvm.read_register" : "llvm.write_register"),
        Fn, MI.getDebugLoc()));
    // A read must still define its result register; use undef.
    if (IsRead)
      MIRBuilder.buildUndef(ValReg);

    MI.eraseFromParent();
    return Legalized;
  }

  if (IsRead)
    MIRBuilder.buildCopy(ValReg, PhysReg);
  else
    MIRBuilder.buildCopy(PhysReg, ValReg);

  MI.eraseFromParent();
  return Legalized;
}
9217
9218 LegalizerHelper::LegalizeResult
lowerSMULH_UMULH(MachineInstr & MI)9219 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
9220 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
9221 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
9222 Register Result = MI.getOperand(0).getReg();
9223 LLT OrigTy = MRI.getType(Result);
9224 auto SizeInBits = OrigTy.getScalarSizeInBits();
9225 LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
9226
9227 auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
9228 auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
9229 auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
9230 unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
9231
9232 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
9233 auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
9234 MIRBuilder.buildTrunc(Result, Shifted);
9235
9236 MI.eraseFromParent();
9237 return Legalized;
9238 }
9239
// Lower G_IS_FPCLASS into integer bit tests on the value's IEEE-754
// representation: the source is copied to an integer of the same width, and
// one predicate per requested class (from the immediate FPClassTest mask) is
// OR'd into the result.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());

  // Trivial masks: testing for no class is constant false; testing for every
  // class is constant true.
  if (Mask == fcNone) {
    MIRBuilder.buildConstant(DstReg, 0);
    MI.eraseFromParent();
    return Legalized;
  }
  if (Mask == fcAllFlags) {
    MIRBuilder.buildConstant(DstReg, 1);
    MI.eraseFromParent();
    return Legalized;
  }

  // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
  // version

  unsigned BitSize = SrcTy.getScalarSizeInBits();
  const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());

  // Reinterpret the FP value as an integer of identical width (element-wise
  // for vectors).
  LLT IntTy = LLT::scalar(BitSize);
  if (SrcTy.isVector())
    IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
  auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);

  // Various masks.
  APInt SignBit = APInt::getSignMask(BitSize);
  APInt ValueMask = APInt::getSignedMaxValue(BitSize);     // All bits but sign.
  APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
  APInt ExpMask = Inf;
  APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
  APInt QNaNBitMask =
      APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
  APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());

  auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit);
  auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask);
  auto InfC = MIRBuilder.buildConstant(IntTy, Inf);
  auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask);
  auto ZeroC = MIRBuilder.buildConstant(IntTy, 0);

  // Abs is |V| as a bit pattern; Sign is true iff the sign bit was set.
  auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC);
  auto Sign =
      MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs);

  // Running result; each handled class test is OR'd in via appendToRes.
  auto Res = MIRBuilder.buildConstant(DstTy, 0);
  // Clang doesn't support capture of structured bindings:
  LLT DstTyCopy = DstTy;
  const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
    Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend);
  };

  // Tests that involve more than one class should be processed first.
  if ((Mask & fcFinite) == fcFinite) {
    // finite(V) ==> abs(V) u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                     ExpMaskC));
    Mask &= ~fcFinite;
  } else if ((Mask & fcFinite) == fcPosFinite) {
    // finite(V) && V > 0 ==> V u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
                                     ExpMaskC));
    Mask &= ~fcPosFinite;
  } else if ((Mask & fcFinite) == fcNegFinite) {
    // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
    auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                    ExpMaskC);
    auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign);
    appendToRes(And);
    Mask &= ~fcNegFinite;
  }

  if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
    // fcZero | fcSubnormal => test all exponent bits are 0
    // TODO: Handle sign bit specific cases
    // TODO: Handle inverted case
    if (PartialCheck == (fcZero | fcSubnormal)) {
      auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       ExpBits, ZeroC));
      Mask &= ~PartialCheck;
    }
  }

  // Check for individual classes.
  if (FPClassTest PartialCheck = Mask & fcZero) {
    // +0 is the all-zero pattern, -0 is just the sign bit, and +/-0 is
    // abs(V) == 0.
    if (PartialCheck == fcPosZero)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, ZeroC));
    else if (PartialCheck == fcZero)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
    else // fcNegZero
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, SignBitC));
  }

  if (FPClassTest PartialCheck = Mask & fcSubnormal) {
    // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
    // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
    auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
    auto OneC = MIRBuilder.buildConstant(IntTy, 1);
    auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
    auto SubnormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
                             MIRBuilder.buildConstant(IntTy, AllOneMantissa));
    if (PartialCheck == fcNegSubnormal)
      SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
    appendToRes(SubnormalRes);
  }

  if (FPClassTest PartialCheck = Mask & fcInf) {
    // Infinity is an exact bit pattern; compare directly against +inf, -inf,
    // or abs(V) against +inf for either sign.
    if (PartialCheck == fcPosInf)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, InfC));
    else if (PartialCheck == fcInf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
    else { // fcNegInf
      APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
      auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, NegInfC));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNan) {
    auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask);
    if (PartialCheck == fcNan) {
      // isnan(V) ==> abs(V) u> int(inf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
    } else if (PartialCheck == fcQNan) {
      // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
                                       InfWithQnanBitC));
    } else { // fcSNan
      // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
      //    abs(V) u< (unsigned(Inf) | quiet_bit)
      auto IsNan =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC);
      auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy,
                                            Abs, InfWithQnanBitC);
      appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNormal) {
    // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
    // (max_exp-1))
    APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
    auto ExpMinusOne = MIRBuilder.buildSub(
        IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB));
    APInt MaxExpMinusOne = ExpMask - ExpLSB;
    auto NormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
                             MIRBuilder.buildConstant(IntTy, MaxExpMinusOne));
    if (PartialCheck == fcNegNormal)
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
    else if (PartialCheck == fcPosNormal) {
      // XOR with all-ones inverts the (boolean) sign predicate.
      auto PosSign = MIRBuilder.buildXor(
          DstTy, Sign, MIRBuilder.buildConstant(DstTy, InversionMask));
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
    }
    appendToRes(NormalRes);
  }

  // Copy the accumulated predicate into the destination register.
  MIRBuilder.buildCopy(DstReg, Res);
  MI.eraseFromParent();
  return Legalized;
}
9413
lowerSelect(MachineInstr & MI)9414 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
9415 // Implement G_SELECT in terms of XOR, AND, OR.
9416 auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
9417 MI.getFirst4RegLLTs();
9418
9419 bool IsEltPtr = DstTy.isPointerOrPointerVector();
9420 if (IsEltPtr) {
9421 LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
9422 LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
9423 Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0);
9424 Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0);
9425 DstTy = NewTy;
9426 }
9427
9428 if (MaskTy.isScalar()) {
9429 // Turn the scalar condition into a vector condition mask if needed.
9430
9431 Register MaskElt = MaskReg;
9432
9433 // The condition was potentially zero extended before, but we want a sign
9434 // extended boolean.
9435 if (MaskTy != LLT::scalar(1))
9436 MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);
9437
9438 // Continue the sign extension (or truncate) to match the data type.
9439 MaskElt =
9440 MIRBuilder.buildSExtOrTrunc(DstTy.getScalarType(), MaskElt).getReg(0);
9441
9442 if (DstTy.isVector()) {
9443 // Generate a vector splat idiom.
9444 auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
9445 MaskReg = ShufSplat.getReg(0);
9446 } else {
9447 MaskReg = MaskElt;
9448 }
9449 MaskTy = DstTy;
9450 } else if (!DstTy.isVector()) {
9451 // Cannot handle the case that mask is a vector and dst is a scalar.
9452 return UnableToLegalize;
9453 }
9454
9455 if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
9456 return UnableToLegalize;
9457 }
9458
9459 auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
9460 auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
9461 auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
9462 if (IsEltPtr) {
9463 auto Or = MIRBuilder.buildOr(DstTy, NewOp1, NewOp2);
9464 MIRBuilder.buildIntToPtr(DstReg, Or);
9465 } else {
9466 MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
9467 }
9468 MI.eraseFromParent();
9469 return Legalized;
9470 }
9471
lowerDIVREM(MachineInstr & MI)9472 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
9473 // Split DIVREM into individual instructions.
9474 unsigned Opcode = MI.getOpcode();
9475
9476 MIRBuilder.buildInstr(
9477 Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
9478 : TargetOpcode::G_UDIV,
9479 {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
9480 MIRBuilder.buildInstr(
9481 Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
9482 : TargetOpcode::G_UREM,
9483 {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
9484 MI.eraseFromParent();
9485 return Legalized;
9486 }
9487
9488 LegalizerHelper::LegalizeResult
lowerAbsToAddXor(MachineInstr & MI)9489 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
9490 // Expand %res = G_ABS %a into:
9491 // %v1 = G_ASHR %a, scalar_size-1
9492 // %v2 = G_ADD %a, %v1
9493 // %res = G_XOR %v2, %v1
9494 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
9495 Register OpReg = MI.getOperand(1).getReg();
9496 auto ShiftAmt =
9497 MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
9498 auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
9499 auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
9500 MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
9501 MI.eraseFromParent();
9502 return Legalized;
9503 }
9504
9505 LegalizerHelper::LegalizeResult
lowerAbsToMaxNeg(MachineInstr & MI)9506 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
9507 // Expand %res = G_ABS %a into:
9508 // %v1 = G_CONSTANT 0
9509 // %v2 = G_SUB %v1, %a
9510 // %res = G_SMAX %a, %v2
9511 Register SrcReg = MI.getOperand(1).getReg();
9512 LLT Ty = MRI.getType(SrcReg);
9513 auto Zero = MIRBuilder.buildConstant(Ty, 0);
9514 auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg);
9515 MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
9516 MI.eraseFromParent();
9517 return Legalized;
9518 }
9519
9520 LegalizerHelper::LegalizeResult
lowerAbsToCNeg(MachineInstr & MI)9521 LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
9522 Register SrcReg = MI.getOperand(1).getReg();
9523 Register DestReg = MI.getOperand(0).getReg();
9524 LLT Ty = MRI.getType(SrcReg), IType = LLT::scalar(1);
9525 auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
9526 auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
9527 auto ICmp = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, IType, SrcReg, Zero);
9528 MIRBuilder.buildSelect(DestReg, ICmp, SrcReg, Sub);
9529 MI.eraseFromParent();
9530 return Legalized;
9531 }
9532
lowerFAbs(MachineInstr & MI)9533 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
9534 Register SrcReg = MI.getOperand(1).getReg();
9535 Register DstReg = MI.getOperand(0).getReg();
9536
9537 LLT Ty = MRI.getType(DstReg);
9538
9539 // Reset sign bit
9540 MIRBuilder.buildAnd(
9541 DstReg, SrcReg,
9542 MIRBuilder.buildConstant(
9543 Ty, APInt::getSignedMaxValue(Ty.getScalarSizeInBits())));
9544
9545 MI.eraseFromParent();
9546 return Legalized;
9547 }
9548
9549 LegalizerHelper::LegalizeResult
lowerVectorReduction(MachineInstr & MI)9550 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
9551 Register SrcReg = MI.getOperand(1).getReg();
9552 LLT SrcTy = MRI.getType(SrcReg);
9553 LLT DstTy = MRI.getType(SrcReg);
9554
9555 // The source could be a scalar if the IR type was <1 x sN>.
9556 if (SrcTy.isScalar()) {
9557 if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
9558 return UnableToLegalize; // FIXME: handle extension.
9559 // This can be just a plain copy.
9560 Observer.changingInstr(MI);
9561 MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
9562 Observer.changedInstr(MI);
9563 return Legalized;
9564 }
9565 return UnableToLegalize;
9566 }
9567
// Lower G_VAARG into explicit va_list manipulation: load the current
// argument pointer, realign it if the slot requires more than the minimum
// stack argument alignment, advance the pointer past the element, store it
// back, and finally load the argument value itself.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
  MachineFunction &MF = *MI.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLVMContext &Ctx = MF.getFunction().getContext();
  // Operand 1 is the pointer to the va_list object.
  Register ListPtr = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(ListPtr);

  // LstPtr is a pointer to the head of the list. Get the address
  // of the head of the list.
  Align PtrAlignment = DL.getABITypeAlign(getTypeForLLT(PtrTy, Ctx));
  MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, PtrTy, PtrAlignment);
  auto VAList = MIRBuilder.buildLoad(PtrTy, ListPtr, *PtrLoadMMO).getReg(0);

  // Operand 2 carries the required alignment of the argument slot. When it
  // exceeds the minimum stack argument alignment, round the list pointer up:
  // VAList = (VAList + A - 1) & ~(A - 1).
  const Align A(MI.getOperand(2).getImm());
  LLT PtrTyAsScalarTy = LLT::scalar(PtrTy.getSizeInBits());
  if (A > TLI.getMinStackArgumentAlignment()) {
    Register AlignAmt =
        MIRBuilder.buildConstant(PtrTyAsScalarTy, A.value() - 1).getReg(0);
    auto AddDst = MIRBuilder.buildPtrAdd(PtrTy, VAList, AlignAmt);
    auto AndDst = MIRBuilder.buildMaskLowPtrBits(PtrTy, AddDst, Log2(A));
    VAList = AndDst.getReg(0);
  }

  // Increment the pointer, VAList, to the next vaarg
  // The list should be bumped by the size of element in the current head of
  // list.
  Register Dst = MI.getOperand(0).getReg();
  LLT LLTTy = MRI.getType(Dst);
  Type *Ty = getTypeForLLT(LLTTy, Ctx);
  auto IncAmt =
      MIRBuilder.buildConstant(PtrTyAsScalarTy, DL.getTypeAllocSize(Ty));
  auto Succ = MIRBuilder.buildPtrAdd(PtrTy, VAList, IncAmt);

  // Store the increment VAList to the legalized pointer
  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOStore, PtrTy, PtrAlignment);
  MIRBuilder.buildStore(Succ, ListPtr, *StoreMMO);
  // Load the actual argument out of the pointer VAList
  Align EltAlignment = DL.getABITypeAlign(Ty);
  MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, LLTTy, EltAlignment);
  MIRBuilder.buildLoad(Dst, VAList, *EltLoadMMO);

  MI.eraseFromParent();
  return Legalized;
}
9615
shouldLowerMemFuncForSize(const MachineFunction & MF)9616 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
9617 // On Darwin, -Os means optimize for size without hurting performance, so
9618 // only really optimize for size when -Oz (MinSize) is used.
9619 if (MF.getTarget().getTargetTriple().isOSDarwin())
9620 return MF.getFunction().hasMinSize();
9621 return MF.getFunction().hasOptSize();
9622 }
9623
// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
                                          unsigned Limit, const MemOp &Op,
                                          unsigned DstAS, unsigned SrcAS,
                                          const AttributeList &FuncAttributes,
                                          const TargetLowering &TLI) {
  // A fixed destination alignment smaller than the source alignment cannot
  // be handled by this scheme.
  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
    return false;

  // Let the target choose its preferred access type first.
  LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
        Ty = LLT::scalar(Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  // Greedily cover the remaining size with the current type, shrinking it
  // (or overlapping the final access with the previous one) when it no
  // longer fits.
  unsigned NumMemOps = 0;
  uint64_t Size = Op.size();
  while (Size) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector load / store's for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
      // Step down to the next smaller power-of-two width.
      NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
      unsigned NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      unsigned Fast;
      // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
              MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    // Respect the target's cap on the number of individual memory ops.
    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(Ty);
    Size -= TySize;
  }

  return true;
}
9690
9691 // Get a vectorized representation of the memset value operand, GISel edition.
getMemsetValue(Register Val,LLT Ty,MachineIRBuilder & MIB)9692 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
9693 MachineRegisterInfo &MRI = *MIB.getMRI();
9694 unsigned NumBits = Ty.getScalarSizeInBits();
9695 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
9696 if (!Ty.isVector() && ValVRegAndVal) {
9697 APInt Scalar = ValVRegAndVal->Value.trunc(8);
9698 APInt SplatVal = APInt::getSplat(NumBits, Scalar);
9699 return MIB.buildConstant(Ty, SplatVal).getReg(0);
9700 }
9701
9702 // Extend the byte value to the larger type, and then multiply by a magic
9703 // value 0x010101... in order to replicate it across every byte.
9704 // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
9705 if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
9706 return MIB.buildConstant(Ty, 0).getReg(0);
9707 }
9708
9709 LLT ExtType = Ty.getScalarType();
9710 auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
9711 if (NumBits > 8) {
9712 APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
9713 auto MagicMI = MIB.buildConstant(ExtType, Magic);
9714 Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
9715 }
9716
9717 // For vector types create a G_BUILD_VECTOR.
9718 if (Ty.isVector())
9719 Val = MIB.buildSplatBuildVector(Ty, Val).getReg(0);
9720
9721 return Val;
9722 }
9723
// Lower a G_MEMSET with a known constant length into a sequence of stores of
// a splatted byte value, using the access types chosen by
// findGISelOptimalMemOpLowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
                             uint64_t KnownLen, Align Alignment,
                             bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset length!");

  // If the destination is a non-fixed stack object we may raise its
  // alignment rather than emit unaligned accesses.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  // Ask for the store-type plan; bail to a libcall if the target's store
  // budget is exceeded. Source address space is irrelevant for memset (~0u).
  if (!findGISelOptimalMemOpLowering(MemOps, Limit,
                                     MemOp::Set(KnownLen, DstAlignCanChange,
                                                Alignment,
                                                /*IsZeroMemset=*/IsZeroVal,
                                                /*IsVolatile=*/IsVolatile),
                                     DstPtrInfo.getAddrSpace(), ~0u,
                                     MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);
    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make it
  // work with larger store types we need to repeat the bit pattern across the
  // wider type.
  Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);

  if (!MemSetValue)
    return UnableToLegalize;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned load / store pair that overlaps with the previous
      // pair. Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return UnableToLegalize;
    }

    auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);

    // Offset the destination pointer for all but the first store.
    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return Legalized;
}
9836
9837 LegalizerHelper::LegalizeResult
lowerMemcpyInline(MachineInstr & MI)9838 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
9839 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
9840
9841 auto [Dst, Src, Len] = MI.getFirst3Regs();
9842
9843 const auto *MMOIt = MI.memoperands_begin();
9844 const MachineMemOperand *MemOp = *MMOIt;
9845 bool IsVolatile = MemOp->isVolatile();
9846
9847 // See if this is a constant length copy
9848 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
9849 // FIXME: support dynamically sized G_MEMCPY_INLINE
9850 assert(LenVRegAndVal &&
9851 "inline memcpy with dynamic size is not yet supported");
9852 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
9853 if (KnownLen == 0) {
9854 MI.eraseFromParent();
9855 return Legalized;
9856 }
9857
9858 const auto &DstMMO = **MI.memoperands_begin();
9859 const auto &SrcMMO = **std::next(MI.memoperands_begin());
9860 Align DstAlign = DstMMO.getBaseAlign();
9861 Align SrcAlign = SrcMMO.getBaseAlign();
9862
9863 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
9864 IsVolatile);
9865 }
9866
9867 LegalizerHelper::LegalizeResult
lowerMemcpyInline(MachineInstr & MI,Register Dst,Register Src,uint64_t KnownLen,Align DstAlign,Align SrcAlign,bool IsVolatile)9868 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
9869 uint64_t KnownLen, Align DstAlign,
9870 Align SrcAlign, bool IsVolatile) {
9871 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
9872 return lowerMemcpy(MI, Dst, Src, KnownLen,
9873 std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
9874 IsVolatile);
9875 }
9876
// Lower a G_MEMCPY with a known constant length into a sequence of
// load/store pairs, using the access types chosen by
// findGISelOptimalMemOpLowering. Returns UnableToLegalize when the plan
// exceeds Limit (the caller then falls back, e.g. to a libcall).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
                             uint64_t KnownLen, uint64_t Limit, Align DstAlign,
                             Align SrcAlign, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  // If the destination is a non-fixed stack object we may raise its
  // alignment rather than emit unaligned accesses.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  Align Alignment = std::min(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  std::vector<LLT> MemOps;

  // Memory operands are ordered destination first, source second.
  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      IsVolatile),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      if (MaybeAlign StackAlign = DL.getStackAlignment())
        NewAlign = std::min(NewAlign, *StackAlign);

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a pair of load and stores for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores
  // mixed types, depending on what the target specifies as good types to use.
  unsigned CurrOffset = 0;
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store. The offset constant built for the load is reused for
    // the store's pointer arithmetic.
    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}
9983
9984 LegalizerHelper::LegalizeResult
lowerMemmove(MachineInstr & MI,Register Dst,Register Src,uint64_t KnownLen,Align DstAlign,Align SrcAlign,bool IsVolatile)9985 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
9986 uint64_t KnownLen, Align DstAlign, Align SrcAlign,
9987 bool IsVolatile) {
9988 auto &MF = *MI.getParent()->getParent();
9989 const auto &TLI = *MF.getSubtarget().getTargetLowering();
9990 auto &DL = MF.getDataLayout();
9991 LLVMContext &C = MF.getFunction().getContext();
9992
9993 assert(KnownLen != 0 && "Have a zero length memmove length!");
9994
9995 bool DstAlignCanChange = false;
9996 MachineFrameInfo &MFI = MF.getFrameInfo();
9997 bool OptSize = shouldLowerMemFuncForSize(MF);
9998 Align Alignment = std::min(DstAlign, SrcAlign);
9999
10000 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
10001 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
10002 DstAlignCanChange = true;
10003
10004 unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
10005 std::vector<LLT> MemOps;
10006
10007 const auto &DstMMO = **MI.memoperands_begin();
10008 const auto &SrcMMO = **std::next(MI.memoperands_begin());
10009 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
10010 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
10011
10012 // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
10013 // to a bug in it's findOptimalMemOpLowering implementation. For now do the
10014 // same thing here.
10015 if (!findGISelOptimalMemOpLowering(
10016 MemOps, Limit,
10017 MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
10018 /*IsVolatile*/ true),
10019 DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
10020 MF.getFunction().getAttributes(), TLI))
10021 return UnableToLegalize;
10022
10023 if (DstAlignCanChange) {
10024 // Get an estimate of the type from the LLT.
10025 Type *IRTy = getTypeForLLT(MemOps[0], C);
10026 Align NewAlign = DL.getABITypeAlign(IRTy);
10027
10028 // Don't promote to an alignment that would require dynamic stack
10029 // realignment.
10030 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
10031 if (!TRI->hasStackRealignment(MF))
10032 if (MaybeAlign StackAlign = DL.getStackAlignment())
10033 NewAlign = std::min(NewAlign, *StackAlign);
10034
10035 if (NewAlign > Alignment) {
10036 Alignment = NewAlign;
10037 unsigned FI = FIDef->getOperand(1).getIndex();
10038 // Give the stack frame object a larger alignment if needed.
10039 if (MFI.getObjectAlign(FI) < Alignment)
10040 MFI.setObjectAlignment(FI, Alignment);
10041 }
10042 }
10043
10044 LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
10045
10046 MachineIRBuilder MIB(MI);
10047 // Memmove requires that we perform the loads first before issuing the stores.
10048 // Apart from that, this loop is pretty much doing the same thing as the
10049 // memcpy codegen function.
10050 unsigned CurrOffset = 0;
10051 SmallVector<Register, 16> LoadVals;
10052 for (auto CopyTy : MemOps) {
10053 // Construct MMO for the load.
10054 auto *LoadMMO =
10055 MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
10056
10057 // Create the load.
10058 Register LoadPtr = Src;
10059 if (CurrOffset != 0) {
10060 LLT SrcTy = MRI.getType(Src);
10061 auto Offset =
10062 MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
10063 LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
10064 }
10065 LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
10066 CurrOffset += CopyTy.getSizeInBytes();
10067 }
10068
10069 CurrOffset = 0;
10070 for (unsigned I = 0; I < MemOps.size(); ++I) {
10071 LLT CopyTy = MemOps[I];
10072 // Now store the values loaded.
10073 auto *StoreMMO =
10074 MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
10075
10076 Register StorePtr = Dst;
10077 if (CurrOffset != 0) {
10078 LLT DstTy = MRI.getType(Dst);
10079 auto Offset =
10080 MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
10081 StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
10082 }
10083 MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
10084 CurrOffset += CopyTy.getSizeInBytes();
10085 }
10086 MI.eraseFromParent();
10087 return Legalized;
10088 }
10089
10090 LegalizerHelper::LegalizeResult
lowerMemCpyFamily(MachineInstr & MI,unsigned MaxLen)10091 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
10092 const unsigned Opc = MI.getOpcode();
10093 // This combine is fairly complex so it's not written with a separate
10094 // matcher function.
10095 assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
10096 Opc == TargetOpcode::G_MEMSET) &&
10097 "Expected memcpy like instruction");
10098
10099 auto MMOIt = MI.memoperands_begin();
10100 const MachineMemOperand *MemOp = *MMOIt;
10101
10102 Align DstAlign = MemOp->getBaseAlign();
10103 Align SrcAlign;
10104 auto [Dst, Src, Len] = MI.getFirst3Regs();
10105
10106 if (Opc != TargetOpcode::G_MEMSET) {
10107 assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
10108 MemOp = *(++MMOIt);
10109 SrcAlign = MemOp->getBaseAlign();
10110 }
10111
10112 // See if this is a constant length copy
10113 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
10114 if (!LenVRegAndVal)
10115 return UnableToLegalize;
10116 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
10117
10118 if (KnownLen == 0) {
10119 MI.eraseFromParent();
10120 return Legalized;
10121 }
10122
10123 bool IsVolatile = MemOp->isVolatile();
10124 // Don't try to optimize volatile.
10125 if (IsVolatile)
10126 return UnableToLegalize;
10127
10128 if (MaxLen && KnownLen > MaxLen)
10129 return UnableToLegalize;
10130
10131 if (Opc == TargetOpcode::G_MEMCPY) {
10132 auto &MF = *MI.getParent()->getParent();
10133 const auto &TLI = *MF.getSubtarget().getTargetLowering();
10134 bool OptSize = shouldLowerMemFuncForSize(MF);
10135 uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
10136 return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
10137 IsVolatile);
10138 }
10139 if (Opc == TargetOpcode::G_MEMMOVE)
10140 return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
10141 if (Opc == TargetOpcode::G_MEMSET)
10142 return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
10143 return UnableToLegalize;
10144 }
10145