xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp (revision fe6060f10f634930ff71b7c50291ddc610da2475)
1*fe6060f1SDimitry Andric //===- MVELaneInterleaving.cpp - Interleave for MVE instructions ----------===//
2*fe6060f1SDimitry Andric //
3*fe6060f1SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*fe6060f1SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*fe6060f1SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*fe6060f1SDimitry Andric //
7*fe6060f1SDimitry Andric //===----------------------------------------------------------------------===//
8*fe6060f1SDimitry Andric //
9*fe6060f1SDimitry Andric // This pass interleaves around sext/zext/trunc instructions. MVE does not have
10*fe6060f1SDimitry Andric // a single sext/zext or trunc instruction that takes the bottom half of a
11*fe6060f1SDimitry Andric // vector and extends to a full width, like NEON has with MOVL. Instead it is
12*fe6060f1SDimitry Andric // expected that this happens through top/bottom instructions. So the MVE
13*fe6060f1SDimitry Andric // equivalent VMOVLT/B instructions take either the even or odd elements of the
14*fe6060f1SDimitry Andric // input and extend them to the larger type, producing a vector with half the
15*fe6060f1SDimitry Andric // number of elements each of double the bitwidth. As there is no simple
16*fe6060f1SDimitry Andric // instruction, we often have to turn sext/zext/trunc into a series of lane
17*fe6060f1SDimitry Andric // moves (or stack loads/stores, which we do not do yet).
18*fe6060f1SDimitry Andric //
19*fe6060f1SDimitry Andric // This pass takes vector code that starts at truncs, looks for interconnected
20*fe6060f1SDimitry Andric // blobs of operations that end with sext/zext (or constants/splats) of the
21*fe6060f1SDimitry Andric // form:
22*fe6060f1SDimitry Andric //   %sa = sext v8i16 %a to v8i32
23*fe6060f1SDimitry Andric //   %sb = sext v8i16 %b to v8i32
24*fe6060f1SDimitry Andric //   %add = add v8i32 %sa, %sb
25*fe6060f1SDimitry Andric //   %r = trunc %add to v8i16
26*fe6060f1SDimitry Andric // And adds shuffles to allow the use of VMOVL/VMOVN instructions:
27*fe6060f1SDimitry Andric //   %sha = shuffle v8i16 %a, undef, <0, 2, 4, 6, 1, 3, 5, 7>
28*fe6060f1SDimitry Andric //   %sa = sext v8i16 %sha to v8i32
29*fe6060f1SDimitry Andric //   %shb = shuffle v8i16 %b, undef, <0, 2, 4, 6, 1, 3, 5, 7>
30*fe6060f1SDimitry Andric //   %sb = sext v8i16 %shb to v8i32
31*fe6060f1SDimitry Andric //   %add = add v8i32 %sa, %sb
32*fe6060f1SDimitry Andric //   %r = trunc %add to v8i16
33*fe6060f1SDimitry Andric //   %shr = shuffle v8i16 %r, undef, <0, 4, 1, 5, 2, 6, 3, 7>
34*fe6060f1SDimitry Andric // Which can then be split and lowered to MVE instructions efficiently:
35*fe6060f1SDimitry Andric //   %sa_b = VMOVLB.s16 %a
36*fe6060f1SDimitry Andric //   %sa_t = VMOVLT.s16 %a
37*fe6060f1SDimitry Andric //   %sb_b = VMOVLB.s16 %b
38*fe6060f1SDimitry Andric //   %sb_t = VMOVLT.s16 %b
39*fe6060f1SDimitry Andric //   %add_b = VADD.i32 %sa_b, %sb_b
40*fe6060f1SDimitry Andric //   %add_t = VADD.i32 %sa_t, %sb_t
41*fe6060f1SDimitry Andric //   %r = VMOVNT.i16 %add_b, %add_t
42*fe6060f1SDimitry Andric //
43*fe6060f1SDimitry Andric //===----------------------------------------------------------------------===//
44*fe6060f1SDimitry Andric 
45*fe6060f1SDimitry Andric #include "ARM.h"
46*fe6060f1SDimitry Andric #include "ARMBaseInstrInfo.h"
47*fe6060f1SDimitry Andric #include "ARMSubtarget.h"
48*fe6060f1SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h"
49*fe6060f1SDimitry Andric #include "llvm/CodeGen/TargetLowering.h"
50*fe6060f1SDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
51*fe6060f1SDimitry Andric #include "llvm/CodeGen/TargetSubtargetInfo.h"
52*fe6060f1SDimitry Andric #include "llvm/IR/BasicBlock.h"
53*fe6060f1SDimitry Andric #include "llvm/IR/Constant.h"
54*fe6060f1SDimitry Andric #include "llvm/IR/Constants.h"
55*fe6060f1SDimitry Andric #include "llvm/IR/DerivedTypes.h"
56*fe6060f1SDimitry Andric #include "llvm/IR/Function.h"
57*fe6060f1SDimitry Andric #include "llvm/IR/IRBuilder.h"
58*fe6060f1SDimitry Andric #include "llvm/IR/InstIterator.h"
59*fe6060f1SDimitry Andric #include "llvm/IR/InstrTypes.h"
60*fe6060f1SDimitry Andric #include "llvm/IR/Instruction.h"
61*fe6060f1SDimitry Andric #include "llvm/IR/Instructions.h"
62*fe6060f1SDimitry Andric #include "llvm/IR/IntrinsicInst.h"
63*fe6060f1SDimitry Andric #include "llvm/IR/Intrinsics.h"
64*fe6060f1SDimitry Andric #include "llvm/IR/IntrinsicsARM.h"
65*fe6060f1SDimitry Andric #include "llvm/IR/PatternMatch.h"
66*fe6060f1SDimitry Andric #include "llvm/IR/Type.h"
67*fe6060f1SDimitry Andric #include "llvm/IR/Value.h"
68*fe6060f1SDimitry Andric #include "llvm/InitializePasses.h"
69*fe6060f1SDimitry Andric #include "llvm/Pass.h"
70*fe6060f1SDimitry Andric #include "llvm/Support/Casting.h"
71*fe6060f1SDimitry Andric #include <algorithm>
72*fe6060f1SDimitry Andric #include <cassert>
73*fe6060f1SDimitry Andric 
74*fe6060f1SDimitry Andric using namespace llvm;
75*fe6060f1SDimitry Andric 
76*fe6060f1SDimitry Andric #define DEBUG_TYPE "mve-laneinterleave"
77*fe6060f1SDimitry Andric 
// Command-line knob (on by default, hidden) to disable this pass for
// debugging and testing.
cl::opt<bool> EnableInterleave(
    "enable-mve-interleave", cl::Hidden, cl::init(true),
    cl::desc("Enable interleave MVE vector operation lowering"));
81*fe6060f1SDimitry Andric 
namespace {

// Legacy FunctionPass that performs MVE lane interleaving (see the file
// header comment for a description of the transform).
class MVELaneInterleaving : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  explicit MVELaneInterleaving() : FunctionPass(ID) {
    initializeMVELaneInterleavingPass(*PassRegistry::getPassRegistry());
  }

  // Entry point; returns true if the function's IR was modified.
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "MVE lane interleaving"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only inserts/rewrites instructions inside blocks; the CFG
    // itself is never altered.
    AU.setPreservesCFG();
    // TargetPassConfig is needed to query the ARM subtarget in
    // runOnFunction.
    AU.addRequired<TargetPassConfig>();
    FunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
104*fe6060f1SDimitry Andric 
char MVELaneInterleaving::ID = 0;

INITIALIZE_PASS(MVELaneInterleaving, DEBUG_TYPE, "MVE lane interleaving", false,
                false)

// Factory used by the ARM target machine to add this pass to its pipeline.
Pass *llvm::createMVELaneInterleavingPass() {
  return new MVELaneInterleaving();
}
113*fe6060f1SDimitry Andric 
114*fe6060f1SDimitry Andric static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
115*fe6060f1SDimitry Andric                                      SmallSetVector<Instruction *, 4> &Truncs) {
116*fe6060f1SDimitry Andric   // This is not always beneficial to transform. Exts can be incorporated into
117*fe6060f1SDimitry Andric   // loads, Truncs can be folded into stores.
118*fe6060f1SDimitry Andric   // Truncs are usually the same number of instructions,
119*fe6060f1SDimitry Andric   //  VSTRH.32(A);VSTRH.32(B) vs VSTRH.16(VMOVNT A, B) with interleaving
120*fe6060f1SDimitry Andric   // Exts are unfortunately more instructions in the general case:
121*fe6060f1SDimitry Andric   //  A=VLDRH.32; B=VLDRH.32;
122*fe6060f1SDimitry Andric   // vs with interleaving:
123*fe6060f1SDimitry Andric   //  T=VLDRH.16; A=VMOVNB T; B=VMOVNT T
124*fe6060f1SDimitry Andric   // But those VMOVL may be folded into a VMULL.
125*fe6060f1SDimitry Andric 
126*fe6060f1SDimitry Andric   // But expensive extends/truncs are always good to remove. FPExts always
127*fe6060f1SDimitry Andric   // involve extra VCVT's so are always considered to be beneficial to convert.
128*fe6060f1SDimitry Andric   for (auto *E : Exts) {
129*fe6060f1SDimitry Andric     if (isa<FPExtInst>(E) || !isa<LoadInst>(E->getOperand(0))) {
130*fe6060f1SDimitry Andric       LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
131*fe6060f1SDimitry Andric       return true;
132*fe6060f1SDimitry Andric     }
133*fe6060f1SDimitry Andric   }
134*fe6060f1SDimitry Andric   for (auto *T : Truncs) {
135*fe6060f1SDimitry Andric     if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
136*fe6060f1SDimitry Andric       LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
137*fe6060f1SDimitry Andric       return true;
138*fe6060f1SDimitry Andric     }
139*fe6060f1SDimitry Andric   }
140*fe6060f1SDimitry Andric 
141*fe6060f1SDimitry Andric   // Otherwise, we know we have a load(ext), see if any of the Extends are a
142*fe6060f1SDimitry Andric   // vmull. This is a simple heuristic and certainly not perfect.
143*fe6060f1SDimitry Andric   for (auto *E : Exts) {
144*fe6060f1SDimitry Andric     if (!E->hasOneUse() ||
145*fe6060f1SDimitry Andric         cast<Instruction>(*E->user_begin())->getOpcode() != Instruction::Mul) {
146*fe6060f1SDimitry Andric       LLVM_DEBUG(dbgs() << "Not beneficial due to " << *E << "\n");
147*fe6060f1SDimitry Andric       return false;
148*fe6060f1SDimitry Andric     }
149*fe6060f1SDimitry Andric   }
150*fe6060f1SDimitry Andric   return true;
151*fe6060f1SDimitry Andric }
152*fe6060f1SDimitry Andric 
// Starting from the vector trunc/fptrunc `Start`, collect the connected
// group of vector operations: extend leaves (sext/zext/fpext), other
// non-instruction leaves (recorded as Uses), supported intermediate ops,
// and trunc roots. If the group's types are supported and the transform
// looks profitable, rewrite it by inserting a deinterleaving shuffle on
// each leaf and an interleaving shuffle after each trunc, allowing later
// lowering to VMOVLB/T and VMOVNB/T instructions. Truncs absorbed into the
// group are added to `Visited` so the caller does not restart from them.
// Returns true if any IR was changed.
static bool tryInterleave(Instruction *Start,
                          SmallPtrSetImpl<Instruction *> &Visited) {
  LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n");
  auto *VT = cast<FixedVectorType>(Start->getType());

  if (!isa<Instruction>(Start->getOperand(0)))
    return false;

  // Look for connected operations starting from Ext's, terminating at Truncs.
  std::vector<Instruction *> Worklist;
  Worklist.push_back(Start);
  Worklist.push_back(cast<Instruction>(Start->getOperand(0)));

  SmallSetVector<Instruction *, 4> Truncs;
  SmallSetVector<Instruction *, 4> Exts;
  SmallSetVector<Use *, 4> OtherLeafs;
  SmallSetVector<Instruction *, 4> Ops;

  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();

    switch (I->getOpcode()) {
    // Truncs
    case Instruction::Trunc:
    case Instruction::FPTrunc:
      if (Truncs.count(I))
        continue;
      Truncs.insert(I);
      Visited.insert(I);
      break;

    // Extend leafs
    case Instruction::SExt:
    case Instruction::ZExt:
    case Instruction::FPExt:
      if (Exts.count(I))
        continue;
      // Walk forward through the users of the extend to pick up the rest of
      // the connected group.
      for (auto *Use : I->users())
        Worklist.push_back(cast<Instruction>(Use));
      Exts.insert(I);
      break;

    case Instruction::Call: {
      // Only known lane-wise intrinsics can be treated like plain operators;
      // anything else aborts the whole group.
      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
      if (!II)
        return false;

      switch (II->getIntrinsicID()) {
      case Intrinsic::abs:
      case Intrinsic::smin:
      case Intrinsic::smax:
      case Intrinsic::umin:
      case Intrinsic::umax:
      case Intrinsic::sadd_sat:
      case Intrinsic::ssub_sat:
      case Intrinsic::uadd_sat:
      case Intrinsic::usub_sat:
      case Intrinsic::minnum:
      case Intrinsic::maxnum:
      case Intrinsic::fabs:
      case Intrinsic::fma:
      case Intrinsic::ceil:
      case Intrinsic::floor:
      case Intrinsic::rint:
      case Intrinsic::round:
      case Intrinsic::trunc:
        break;
      default:
        return false;
      }
      LLVM_FALLTHROUGH; // Fall through to treating these like an operator below.
    }
    // Binary/tertiary ops
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::Mul:
    case Instruction::AShr:
    case Instruction::LShr:
    case Instruction::Shl:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::FAdd:
    case Instruction::FMul:
    case Instruction::Select:
      if (Ops.count(I))
        continue;
      Ops.insert(I);

      // Follow vector operands backwards; non-instruction vector operands
      // (e.g. constants/splats) become leaves that also need a shuffle.
      for (Use &Op : I->operands()) {
        if (!isa<FixedVectorType>(Op->getType()))
          continue;
        if (isa<Instruction>(Op))
          Worklist.push_back(cast<Instruction>(&Op));
        else
          OtherLeafs.insert(&Op);
      }

      // And follow users forwards, to reach the trunc roots of the group.
      for (auto *Use : I->users())
        Worklist.push_back(cast<Instruction>(Use));
      break;

    case Instruction::ShuffleVector:
      // A shuffle of a splat is a splat.
      if (cast<ShuffleVectorInst>(I)->isZeroEltSplat())
        continue;
      LLVM_FALLTHROUGH;

    default:
      LLVM_DEBUG(dbgs() << "  Unhandled instruction: " << *I << "\n");
      return false;
    }
  }

  if (Exts.empty() && OtherLeafs.empty())
    return false;

  LLVM_DEBUG({
    dbgs() << "Found group:\n  Exts:";
    for (auto *I : Exts)
      dbgs() << "  " << *I << "\n";
    dbgs() << "  Ops:";
    for (auto *I : Ops)
      dbgs() << "  " << *I << "\n";
    dbgs() << "  OtherLeafs:";
    for (auto *I : OtherLeafs)
      dbgs() << "  " << *I->get() << " of " << *I->getUser() << "\n";
    dbgs() << "Truncs:";
    for (auto *I : Truncs)
      dbgs() << "  " << *I << "\n";
  });

  assert(!Truncs.empty() && "Expected some truncs");

  // Check types. Only i8/i16 trunc results are supported; BaseElts is the
  // element count of one 128-bit vector of that width (8 x i16 or 16 x i8).
  unsigned NumElts = VT->getNumElements();
  unsigned BaseElts = VT->getScalarSizeInBits() == 16
                          ? 8
                          : (VT->getScalarSizeInBits() == 8 ? 16 : 0);
  if (BaseElts == 0 || NumElts % BaseElts != 0) {
    LLVM_DEBUG(dbgs() << "  Type is unsupported\n");
    return false;
  }
  if (Start->getOperand(0)->getType()->getScalarSizeInBits() !=
      VT->getScalarSizeInBits() * 2) {
    LLVM_DEBUG(dbgs() << "  Type not double sized\n");
    return false;
  }
  // All extends must consume, and all truncs must produce, the same narrow
  // vector type as Start, so one shuffle mask fits everywhere.
  for (Instruction *I : Exts)
    if (I->getOperand(0)->getType() != VT) {
      LLVM_DEBUG(dbgs() << "  Wrong type on " << *I << "\n");
      return false;
    }
  for (Instruction *I : Truncs)
    if (I->getType() != VT) {
      LLVM_DEBUG(dbgs() << "  Wrong type on " << *I << "\n");
      return false;
    }

  // Check that it looks beneficial
  if (!isProfitableToInterleave(Exts, Truncs))
    return false;

  // Create new shuffles around the extends / truncs / other leaves.
  IRBuilder<> Builder(Start);

  SmallVector<int, 16> LeafMask;
  SmallVector<int, 16> TruncMask;
  // LeafMask deinterleaves (evens then odds); TruncMask re-interleaves,
  // undoing LeafMask. Per 128-bit base chunk:
  // LeafMask : 0, 2, 4, 6, 1, 3, 5, 7   8, 10, 12, 14,  9, 11, 13, 15
  // TruncMask: 0, 4, 1, 5, 2, 6, 3, 7   8, 12,  9, 13, 10, 14, 11, 15
  for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
    for (unsigned i = 0; i < BaseElts / 2; i++)
      LeafMask.push_back(Base + i * 2);
    for (unsigned i = 0; i < BaseElts / 2; i++)
      LeafMask.push_back(Base + i * 2 + 1);
  }
  for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
    for (unsigned i = 0; i < BaseElts / 2; i++) {
      TruncMask.push_back(Base + i);
      TruncMask.push_back(Base + i + BaseElts / 2);
    }
  }

  // Shuffle the input of each extend, keeping the original extend kind
  // (fpext / sext / zext).
  for (Instruction *I : Exts) {
    LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
    Builder.SetInsertPoint(I);
    Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
    bool FPext = isa<FPExtInst>(I);
    bool Sext = isa<SExtInst>(I);
    Value *Ext = FPext ? Builder.CreateFPExt(Shuffle, I->getType())
                       : Sext ? Builder.CreateSExt(Shuffle, I->getType())
                              : Builder.CreateZExt(Shuffle, I->getType());
    I->replaceAllUsesWith(Ext);
    LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
  }

  // Shuffle each non-extend leaf directly at its use.
  for (Use *I : OtherLeafs) {
    LLVM_DEBUG(dbgs() << "Replacing leaf " << *I << "\n");
    Builder.SetInsertPoint(cast<Instruction>(I->getUser()));
    Value *Shuffle = Builder.CreateShuffleVector(I->get(), LeafMask);
    I->getUser()->setOperand(I->getOperandNo(), Shuffle);
    LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
  }

  for (Instruction *I : Truncs) {
    LLVM_DEBUG(dbgs() << "Replacing trunc " << *I << "\n");

    // Insert the re-interleaving shuffle right after the trunc, redirect all
    // uses of the trunc to it, then restore the trunc as the shuffle's own
    // input (replaceAllUsesWith also rewired the shuffle's operand).
    Builder.SetInsertPoint(I->getParent(), ++I->getIterator());
    Value *Shuf = Builder.CreateShuffleVector(I, TruncMask);
    I->replaceAllUsesWith(Shuf);
    cast<Instruction>(Shuf)->setOperand(0, I);

    LLVM_DEBUG(dbgs() << "  with " << *Shuf << "\n");
  }

  return true;
}
370*fe6060f1SDimitry Andric 
371*fe6060f1SDimitry Andric bool MVELaneInterleaving::runOnFunction(Function &F) {
372*fe6060f1SDimitry Andric   if (!EnableInterleave)
373*fe6060f1SDimitry Andric     return false;
374*fe6060f1SDimitry Andric   auto &TPC = getAnalysis<TargetPassConfig>();
375*fe6060f1SDimitry Andric   auto &TM = TPC.getTM<TargetMachine>();
376*fe6060f1SDimitry Andric   auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
377*fe6060f1SDimitry Andric   if (!ST->hasMVEIntegerOps())
378*fe6060f1SDimitry Andric     return false;
379*fe6060f1SDimitry Andric 
380*fe6060f1SDimitry Andric   bool Changed = false;
381*fe6060f1SDimitry Andric 
382*fe6060f1SDimitry Andric   SmallPtrSet<Instruction *, 16> Visited;
383*fe6060f1SDimitry Andric   for (Instruction &I : reverse(instructions(F))) {
384*fe6060f1SDimitry Andric     if (I.getType()->isVectorTy() &&
385*fe6060f1SDimitry Andric         (isa<TruncInst>(I) || isa<FPTruncInst>(I)) && !Visited.count(&I))
386*fe6060f1SDimitry Andric       Changed |= tryInterleave(&I, Visited);
387*fe6060f1SDimitry Andric   }
388*fe6060f1SDimitry Andric 
389*fe6060f1SDimitry Andric   return Changed;
390*fe6060f1SDimitry Andric }
391