xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp (revision b4af4f93c682e445bf159f0d1ec90b636296c946)
1 //===- X86InterleavedAccess.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file contains the X86 implementation of the interleaved accesses
11 /// optimization generating X86-specific instructions/intrinsics for
12 /// interleaved access groups.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86ISelLowering.h"
17 #include "X86Subtarget.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/Analysis/VectorUtils.h"
21 #include "llvm/IR/Constants.h"
22 #include "llvm/IR/DataLayout.h"
23 #include "llvm/IR/DerivedTypes.h"
24 #include "llvm/IR/IRBuilder.h"
25 #include "llvm/IR/Instruction.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/IR/Module.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/IR/Value.h"
30 #include "llvm/Support/Casting.h"
31 #include "llvm/Support/MachineValueType.h"
32 #include <algorithm>
33 #include <cassert>
34 #include <cmath>
35 #include <cstdint>
36 
37 using namespace llvm;
38 
39 namespace {
40 
41 /// This class holds necessary information to represent an interleaved
42 /// access group and supports utilities to lower the group into
43 /// X86-specific instructions/intrinsics.
44 ///  E.g. A group of interleaving access loads (Factor = 2; accessing every
45 ///       other element)
46 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
47 ///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
48 ///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
49 class X86InterleavedAccessGroup {
50   /// Reference to the wide-load instruction of an interleaved access
51   /// group.
52   Instruction *const Inst;
53 
54   /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
55   ArrayRef<ShuffleVectorInst *> Shuffles;
56 
57   /// Reference to the starting index of each user-shuffle.
58   ArrayRef<unsigned> Indices;
59 
60   /// Reference to the interleaving stride in terms of elements.
61   const unsigned Factor;
62 
63   /// Reference to the underlying target.
64   const X86Subtarget &Subtarget;
65 
66   const DataLayout &DL;
67 
68   IRBuilder<> &Builder;
69 
70   /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
71   /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
72   void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
73                  SmallVectorImpl<Instruction *> &DecomposedVectors);
74 
75   /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
76   /// returns the transposed-vectors in \p TransposedVectors.
77   /// E.g.
78   /// InputVectors:
79   ///   In-V0 = p1, p2, p3, p4
80   ///   In-V1 = q1, q2, q3, q4
81   ///   In-V2 = r1, r2, r3, r4
82   ///   In-V3 = s1, s2, s3, s4
83   /// OutputVectors:
84   ///   Out-V0 = p1, q1, r1, s1
85   ///   Out-V1 = p2, q2, r2, s2
86   ///   Out-V2 = p3, q3, r3, s3
87   ///   Out-V3 = P4, q4, r4, s4
88   void transpose_4x4(ArrayRef<Instruction *> InputVectors,
89                      SmallVectorImpl<Value *> &TransposedMatrix);
90   void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
91                              SmallVectorImpl<Value *> &TransposedMatrix,
92                              unsigned NumSubVecElems);
93   void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
94                                 SmallVectorImpl<Value *> &TransposedMatrix);
95   void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
96                              SmallVectorImpl<Value *> &TransposedMatrix,
97                              unsigned NumSubVecElems);
98   void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
99                                SmallVectorImpl<Value *> &TransposedMatrix,
100                                unsigned NumSubVecElems);
101 
102 public:
103   /// In order to form an interleaved access group X86InterleavedAccessGroup
104   /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
105   /// \p Shuffs, reference to the first indices of each interleaved-vector
106   /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
107   /// X86-specific instructions/intrinsics it also requires the underlying
108   /// target information \p STarget.
109   explicit X86InterleavedAccessGroup(Instruction *I,
110                                      ArrayRef<ShuffleVectorInst *> Shuffs,
111                                      ArrayRef<unsigned> Ind, const unsigned F,
112                                      const X86Subtarget &STarget,
113                                      IRBuilder<> &B)
114       : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
115         DL(Inst->getModule()->getDataLayout()), Builder(B) {}
116 
117   /// Returns true if this interleaved access group can be lowered into
118   /// x86-specific instructions/intrinsics, false otherwise.
119   bool isSupported() const;
120 
121   /// Lowers this interleaved access group into X86-specific
122   /// instructions/intrinsics.
123   bool lowerIntoOptimizedSequence();
124 };
125 
126 } // end anonymous namespace
127 
128 bool X86InterleavedAccessGroup::isSupported() const {
129   VectorType *ShuffleVecTy = Shuffles[0]->getType();
130   Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
131   unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
132   unsigned WideInstSize;
133 
134   // Currently, lowering is supported for the following vectors:
135   // Stride 4:
136   //    1. Store and load of 4-element vectors of 64 bits on AVX.
137   //    2. Store of 16/32-element vectors of 8 bits on AVX.
138   // Stride 3:
139   //    1. Load of 16/32-element vectors of 8 bits on AVX.
140   if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
141     return false;
142 
143   if (isa<LoadInst>(Inst)) {
144     WideInstSize = DL.getTypeSizeInBits(Inst->getType());
145     if (cast<LoadInst>(Inst)->getPointerAddressSpace())
146       return false;
147   } else
148     WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
149 
150   // We support shuffle represents stride 4 for byte type with size of
151   // WideInstSize.
152   if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
153      return true;
154 
155   if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
156       (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
157        WideInstSize == 2048))
158     return true;
159 
160   if (ShuffleElemSize == 8 && Factor == 3 &&
161       (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
162     return true;
163 
164   return false;
165 }
166 
167 void X86InterleavedAccessGroup::decompose(
168     Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
169     SmallVectorImpl<Instruction *> &DecomposedVectors) {
170   assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
171          "Expected Load or Shuffle");
172 
173   Type *VecWidth = VecInst->getType();
174   (void)VecWidth;
175   assert(VecWidth->isVectorTy() &&
176          DL.getTypeSizeInBits(VecWidth) >=
177              DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
178          "Invalid Inst-size!!!");
179 
180   if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
181     Value *Op0 = SVI->getOperand(0);
182     Value *Op1 = SVI->getOperand(1);
183 
184     // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
185     for (unsigned i = 0; i < NumSubVectors; ++i)
186       DecomposedVectors.push_back(
187           cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
188               Op0, Op1,
189               createSequentialMask(Builder, Indices[i],
190                                    SubVecTy->getVectorNumElements(), 0))));
191     return;
192   }
193 
194   // Decompose the load instruction.
195   LoadInst *LI = cast<LoadInst>(VecInst);
196   Type *VecBaseTy, *VecBasePtrTy;
197   Value *VecBasePtr;
198   unsigned int NumLoads = NumSubVectors;
199   // In the case of stride 3 with a vector of 32 elements load the information
200   // in the following way:
201   // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
202   unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
203   if (VecLength == 768 || VecLength == 1536) {
204     VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
205     VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
206     VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
207     NumLoads = NumSubVectors * (VecLength / 384);
208   } else {
209     VecBaseTy = SubVecTy;
210     VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
211     VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
212   }
213   // Generate N loads of T type.
214   for (unsigned i = 0; i < NumLoads; i++) {
215     // TODO: Support inbounds GEP.
216     Value *NewBasePtr =
217         Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
218     Instruction *NewLoad =
219         Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment());
220     DecomposedVectors.push_back(NewLoad);
221   }
222 }
223 
224 // Changing the scale of the vector type by reducing the number of elements and
225 // doubling the scalar size.
226 static MVT scaleVectorType(MVT VT) {
227   unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
228   return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
229                           VT.getVectorNumElements() / 2);
230 }
231 
// Identity index table {0, 1, ..., 63}. Prefixes of it serve as shuffle masks
// that concatenate two vectors (or copy one unchanged). Nothing writes to the
// table, so it is declared const.
static const uint32_t Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,
    8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55,
    56, 57, 58, 59, 60, 61, 62, 63};
237 
238 // genShuffleBland - Creates shuffle according to two vectors.This function is
239 // only works on instructions with lane inside 256 registers. According to
240 // the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
241 // offset amount depends on the two integer, 'LowOffset' and 'HighOffset'.
242 // Where the 'LowOffset' refers to the first vector and the highOffset refers to
243 // the second vector.
244 // |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
245 // |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
246 // |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
247 // For the sequence to work as a mirror to the load.
248 // We must consider the elements order as above.
249 // In this function we are combining two types of shuffles.
250 // The first one is vpshufed and the second is a type of "blend" shuffle.
251 // By computing the shuffle on a sequence of 16 elements(one lane) and add the
252 // correct offset. We are creating a vpsuffed + blend sequence between two
253 // shuffles.
254 static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
255   SmallVectorImpl<uint32_t> &Out, int LowOffset,
256   int HighOffset) {
257   assert(VT.getSizeInBits() >= 256 &&
258     "This function doesn't accept width smaller then 256");
259   unsigned NumOfElm = VT.getVectorNumElements();
260   for (unsigned i = 0; i < Mask.size(); i++)
261     Out.push_back(Mask[i] + LowOffset);
262   for (unsigned i = 0; i < Mask.size(); i++)
263     Out.push_back(Mask[i] + HighOffset + NumOfElm);
264 }
265 
266 // reorderSubVector returns the data to is the original state. And de-facto is
267 // the opposite of  the function concatSubVector.
268 
269 // For VecElems = 16
270 // Invec[0] -  |0|      TransposedMatrix[0] - |0|
271 // Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
272 // Invec[2] -  |2|      TransposedMatrix[2] - |2|
273 
274 // For VecElems = 32
275 // Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
276 // Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
277 // Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|
278 
279 // For VecElems = 64
280 // Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
281 // Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
282 // Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|
283 
284 static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
285   ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
286   unsigned VecElems, unsigned Stride,
287   IRBuilder<> Builder) {
288 
289   if (VecElems == 16) {
290     for (unsigned i = 0; i < Stride; i++)
291       TransposedMatrix[i] = Builder.CreateShuffleVector(
292         Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
293     return;
294   }
295 
296   SmallVector<uint32_t, 32> OptimizeShuf;
297   Value *Temp[8];
298 
299   for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
300     genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
301       (i + 1) / Stride * 16);
302     Temp[i / 2] = Builder.CreateShuffleVector(
303       Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
304     OptimizeShuf.clear();
305   }
306 
307   if (VecElems == 32) {
308     std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
309     return;
310   }
311   else
312     for (unsigned i = 0; i < Stride; i++)
313       TransposedMatrix[i] =
314       Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
315 }
316 
317 void X86InterleavedAccessGroup::interleave8bitStride4VF8(
318     ArrayRef<Instruction *> Matrix,
319     SmallVectorImpl<Value *> &TransposedMatrix) {
320   // Assuming we start from the following vectors:
321   // Matrix[0]= c0 c1 c2 c3 c4 ... c7
322   // Matrix[1]= m0 m1 m2 m3 m4 ... m7
323   // Matrix[2]= y0 y1 y2 y3 y4 ... y7
324   // Matrix[3]= k0 k1 k2 k3 k4 ... k7
325 
326   MVT VT = MVT::v8i16;
327   TransposedMatrix.resize(2);
328   SmallVector<uint32_t, 16> MaskLow;
329   SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
330   SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
331 
332   for (unsigned i = 0; i < 8; ++i) {
333     MaskLow.push_back(i);
334     MaskLow.push_back(i + 8);
335   }
336 
337   createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
338   createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
339   scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
340   scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
341   // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
342   // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
343   Value *IntrVec1Low =
344       Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
345   Value *IntrVec2Low =
346       Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
347 
348   // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
349   // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
350 
351   TransposedMatrix[0] =
352       Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
353   TransposedMatrix[1] =
354       Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
355 }
356 
357 void X86InterleavedAccessGroup::interleave8bitStride4(
358     ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
359     unsigned NumOfElm) {
360   // Example: Assuming we start from the following vectors:
361   // Matrix[0]= c0 c1 c2 c3 c4 ... c31
362   // Matrix[1]= m0 m1 m2 m3 m4 ... m31
363   // Matrix[2]= y0 y1 y2 y3 y4 ... y31
364   // Matrix[3]= k0 k1 k2 k3 k4 ... k31
365 
366   MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
367   MVT HalfVT = scaleVectorType(VT);
368 
369   TransposedMatrix.resize(4);
370   SmallVector<uint32_t, 32> MaskHigh;
371   SmallVector<uint32_t, 32> MaskLow;
372   SmallVector<uint32_t, 32> LowHighMask[2];
373   SmallVector<uint32_t, 32> MaskHighTemp;
374   SmallVector<uint32_t, 32> MaskLowTemp;
375 
376   // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
377   // shuffle pattern.
378 
379   createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
380   createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
381 
382   // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
383   // shuffle pattern.
384 
385   createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
386   createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
387   scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
388   scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);
389 
390   // IntrVec1Low  = c0  m0  c1  m1 ... c7  m7  | c16 m16 c17 m17 ... c23 m23
391   // IntrVec1High = c8  m8  c9  m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
392   // IntrVec2Low  = y0  k0  y1  k1 ... y7  k7  | y16 k16 y17 k17 ... y23 k23
393   // IntrVec2High = y8  k8  y9  k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
394   Value *IntrVec[4];
395 
396   IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
397   IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
398   IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
399   IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
400 
401   // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
402   // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
403   // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
404   // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27
405 
406   Value *VecOut[4];
407   for (int i = 0; i < 4; i++)
408     VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
409                                             LowHighMask[i % 2]);
410 
411   // cmyk0  cmyk1  cmyk2  cmyk3   | cmyk4  cmyk5  cmyk6  cmyk7
412   // cmyk8  cmyk9  cmyk10 cmyk11  | cmyk12 cmyk13 cmyk14 cmyk15
413   // cmyk16 cmyk17 cmyk18 cmyk19  | cmyk20 cmyk21 cmyk22 cmyk23
414   // cmyk24 cmyk25 cmyk26 cmyk27  | cmyk28 cmyk29 cmyk30 cmyk31
415 
416   if (VT == MVT::v16i8) {
417     std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
418     return;
419   }
420 
421   reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
422                    NumOfElm, 4, Builder);
423 }
424 
425 //  createShuffleStride returns shuffle mask of size N.
426 //  The shuffle pattern is as following :
427 //  {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
428 //  (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
429 //  (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
430 //  Where Lane is the # of lanes in a register:
431 //  VectorSize = 128 => Lane = 1
432 //  VectorSize = 256 => Lane = 2
433 //  For example shuffle pattern for VF 16 register size 256 -> lanes = 2
434 //  {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
435 static void createShuffleStride(MVT VT, int Stride,
436                                 SmallVectorImpl<uint32_t> &Mask) {
437   int VectorSize = VT.getSizeInBits();
438   int VF = VT.getVectorNumElements();
439   int LaneCount = std::max(VectorSize / 128, 1);
440   for (int Lane = 0; Lane < LaneCount; Lane++)
441     for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
442       Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
443 }
444 
445 //  setGroupSize sets 'SizeInfo' to the size(number of elements) of group
446 //  inside mask a shuffleMask. A mask contains exactly 3 groups, where
447 //  each group is a monotonically increasing sequence with stride 3.
448 //  For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
449 static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
450   int VectorSize = VT.getSizeInBits();
451   int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
452   for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
453     int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
454     SizeInfo.push_back(GroupSize);
455     FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
456   }
457 }
458 
459 //  DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
460 //  vpalign works according to lanes
461 //  Where Lane is the # of lanes in a register:
462 //  VectorWide = 128 => Lane = 1
463 //  VectorWide = 256 => Lane = 2
464 //  For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
465 //  For Lane = 2 shuffle pattern is:
466 //  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
467 //  Imm variable sets the offset amount. The result of the
468 //  function is stored inside ShuffleMask vector and it built as described in
469 //  the begin of the description. AlignDirection is a boolean that indicates the
470 //  direction of the alignment. (false - align to the "right" side while true -
471 //  align to the "left" side)
472 static void DecodePALIGNRMask(MVT VT, unsigned Imm,
473                               SmallVectorImpl<uint32_t> &ShuffleMask,
474                               bool AlignDirection = true, bool Unary = false) {
475   unsigned NumElts = VT.getVectorNumElements();
476   unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
477   unsigned NumLaneElts = NumElts / NumLanes;
478 
479   Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
480   unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
481 
482   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
483     for (unsigned i = 0; i != NumLaneElts; ++i) {
484       unsigned Base = i + Offset;
485       // if i+offset is out of this lane then we actually need the other source
486       // If Unary the other source is the first source.
487       if (Base >= NumLaneElts)
488         Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
489       ShuffleMask.push_back(Base + l);
490     }
491   }
492 }
493 
494 // concatSubVector - The function rebuilds the data to a correct expected
495 // order. An assumption(The shape of the matrix) was taken for the
496 // deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
497 // This function ensures that the data is built in correct way for the lane
498 // instructions. Each lane inside the vector is a 128-bit length.
499 //
500 // The 'InVec' argument contains the data in increasing order. In InVec[0] You
501 // can find the first 128 bit data. The number of different lanes inside a
502 // vector depends on the 'VecElems'.In general, the formula is
503 // VecElems * type / 128. The size of the array 'InVec' depends and equal to
504 // 'VecElems'.
505 
506 // For VecElems = 16
507 // Invec[0] - |0|      Vec[0] - |0|
508 // Invec[1] - |1|  =>  Vec[1] - |1|
509 // Invec[2] - |2|      Vec[2] - |2|
510 
511 // For VecElems = 32
512 // Invec[0] - |0|1|      Vec[0] - |0|3|
513 // Invec[1] - |2|3|  =>  Vec[1] - |1|4|
514 // Invec[2] - |4|5|      Vec[2] - |2|5|
515 
516 // For VecElems = 64
517 // Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
518 // Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
519 // Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|
520 
521 static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
522                             unsigned VecElems, IRBuilder<> Builder) {
523   if (VecElems == 16) {
524     for (int i = 0; i < 3; i++)
525       Vec[i] = InVec[i];
526     return;
527   }
528 
529   for (unsigned j = 0; j < VecElems / 32; j++)
530     for (int i = 0; i < 3; i++)
531       Vec[i + j * 3] = Builder.CreateShuffleVector(
532           InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));
533 
534   if (VecElems == 32)
535     return;
536 
537   for (int i = 0; i < 3; i++)
538     Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
539 }
540 
541 void X86InterleavedAccessGroup::deinterleave8bitStride3(
542     ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
543     unsigned VecElems) {
544   // Example: Assuming we start from the following vectors:
545   // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
546   // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
547   // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
548 
549   TransposedMatrix.resize(3);
550   SmallVector<uint32_t, 32> VPShuf;
551   SmallVector<uint32_t, 32> VPAlign[2];
552   SmallVector<uint32_t, 32> VPAlign2;
553   SmallVector<uint32_t, 32> VPAlign3;
554   SmallVector<uint32_t, 3> GroupSize;
555   Value *Vec[6], *TempVector[3];
556 
557   MVT VT = MVT::getVT(Shuffles[0]->getType());
558 
559   createShuffleStride(VT, 3, VPShuf);
560   setGroupSize(VT, GroupSize);
561 
562   for (int i = 0; i < 2; i++)
563     DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
564 
565   DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
566   DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
567 
568   concatSubVector(Vec, InVec, VecElems, Builder);
569   // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
570   // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
571   // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
572 
573   for (int i = 0; i < 3; i++)
574     Vec[i] = Builder.CreateShuffleVector(
575         Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);
576 
577   // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
578   // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
579   // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
580 
581   for (int i = 0; i < 3; i++)
582     TempVector[i] =
583         Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
584 
585   // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
586   // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
587   // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
588 
589   for (int i = 0; i < 3; i++)
590     Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
591                                          VPAlign[1]);
592 
593   // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
594   // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
595   // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
596 
597   Value *TempVec = Builder.CreateShuffleVector(
598       Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
599   TransposedMatrix[0] = Builder.CreateShuffleVector(
600       Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
601   TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
602   TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
603 }
604 
605 // group2Shuffle reorder the shuffle stride back into continuous order.
606 // For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
607 // MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
608 static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
609                           SmallVectorImpl<uint32_t> &Output) {
610   int IndexGroup[3] = {0, 0, 0};
611   int Index = 0;
612   int VectorWidth = VT.getSizeInBits();
613   int VF = VT.getVectorNumElements();
614   // Find the index of the different groups.
615   int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
616   for (int i = 0; i < 3; i++) {
617     IndexGroup[(Index * 3) % (VF / Lane)] = Index;
618     Index += Mask[i];
619   }
620   // According to the index compute the convert mask.
621   for (int i = 0; i < VF / Lane; i++) {
622     Output.push_back(IndexGroup[i % 3]);
623     IndexGroup[i % 3]++;
624   }
625 }
626 
627 void X86InterleavedAccessGroup::interleave8bitStride3(
628     ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
629     unsigned VecElems) {
630   // Example: Assuming we start from the following vectors:
631   // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
632   // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
633   // Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7
634 
635   TransposedMatrix.resize(3);
636   SmallVector<uint32_t, 3> GroupSize;
637   SmallVector<uint32_t, 32> VPShuf;
638   SmallVector<uint32_t, 32> VPAlign[3];
639   SmallVector<uint32_t, 32> VPAlign2;
640   SmallVector<uint32_t, 32> VPAlign3;
641 
642   Value *Vec[3], *TempVector[3];
643   MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
644 
645   setGroupSize(VT, GroupSize);
646 
647   for (int i = 0; i < 3; i++)
648     DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);
649 
650   DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
651   DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);
652 
653   // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
654   // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
655   // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
656 
657   Vec[0] = Builder.CreateShuffleVector(
658       InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
659   Vec[1] = Builder.CreateShuffleVector(
660       InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
661   Vec[2] = InVec[2];
662 
663   // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
664   // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
665   // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
666 
667   for (int i = 0; i < 3; i++)
668     TempVector[i] =
669         Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
670 
671   // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
672   // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
673   // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
674 
675   for (int i = 0; i < 3; i++)
676     Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
677                                          VPAlign[2]);
678 
679   // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
680   // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
681   // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
682 
683   unsigned NumOfElm = VT.getVectorNumElements();
684   group2Shuffle(VT, GroupSize, VPShuf);
685   reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);
686 }
687 
688 void X86InterleavedAccessGroup::transpose_4x4(
689     ArrayRef<Instruction *> Matrix,
690     SmallVectorImpl<Value *> &TransposedMatrix) {
691   assert(Matrix.size() == 4 && "Invalid matrix size");
692   TransposedMatrix.resize(4);
693 
694   // dst = src1[0,1],src2[0,1]
695   uint32_t IntMask1[] = {0, 1, 4, 5};
696   ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
697   Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
698   Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
699 
700   // dst = src1[2,3],src2[2,3]
701   uint32_t IntMask2[] = {2, 3, 6, 7};
702   Mask = makeArrayRef(IntMask2, 4);
703   Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
704   Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
705 
706   // dst = src1[0],src2[0],src1[2],src2[2]
707   uint32_t IntMask3[] = {0, 4, 2, 6};
708   Mask = makeArrayRef(IntMask3, 4);
709   TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
710   TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
711 
712   // dst = src1[1],src2[1],src1[3],src2[3]
713   uint32_t IntMask4[] = {1, 5, 3, 7};
714   Mask = makeArrayRef(IntMask4, 4);
715   TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
716   TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
717 }
718 
719 // Lowers this interleaved access group into X86-specific
720 // instructions/intrinsics.
721 bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
722   SmallVector<Instruction *, 4> DecomposedVectors;
723   SmallVector<Value *, 4> TransposedVectors;
724   VectorType *ShuffleTy = Shuffles[0]->getType();
725 
726   if (isa<LoadInst>(Inst)) {
727     // Try to generate target-sized register(/instruction).
728     decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
729 
730     Type *ShuffleEltTy = Inst->getType();
731     unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
732     // Perform matrix-transposition in order to compute interleaved
733     // results by generating some sort of (optimized) target-specific
734     // instructions.
735 
736     switch (NumSubVecElems) {
737     default:
738       return false;
739     case 4:
740       transpose_4x4(DecomposedVectors, TransposedVectors);
741       break;
742     case 8:
743     case 16:
744     case 32:
745     case 64:
746       deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
747                               NumSubVecElems);
748       break;
749     }
750 
751     // Now replace the unoptimized-interleaved-vectors with the
752     // transposed-interleaved vectors.
753     for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
754       Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
755 
756     return true;
757   }
758 
759   Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
760   unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
761 
762   // Lower the interleaved stores:
763   //   1. Decompose the interleaved wide shuffle into individual shuffle
764   //   vectors.
765   decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
766             DecomposedVectors);
767 
768   //   2. Transpose the interleaved-vectors into vectors of contiguous
769   //      elements.
770   switch (NumSubVecElems) {
771   case 4:
772     transpose_4x4(DecomposedVectors, TransposedVectors);
773     break;
774   case 8:
775     interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
776     break;
777   case 16:
778   case 32:
779   case 64:
780     if (Factor == 4)
781       interleave8bitStride4(DecomposedVectors, TransposedVectors,
782                             NumSubVecElems);
783     if (Factor == 3)
784       interleave8bitStride3(DecomposedVectors, TransposedVectors,
785                             NumSubVecElems);
786     break;
787   default:
788     return false;
789   }
790 
791   //   3. Concatenate the contiguous-vectors back into a wide vector.
792   Value *WideVec = concatenateVectors(Builder, TransposedVectors);
793 
794   //   4. Generate a store instruction for wide-vec.
795   StoreInst *SI = cast<StoreInst>(Inst);
796   Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
797                              SI->getAlignment());
798 
799   return true;
800 }
801 
802 // Lower interleaved load(s) into target specific instructions/
803 // intrinsics. Lowering sequence varies depending on the vector-types, factor,
804 // number of shuffles and ISA.
805 // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
806 bool X86TargetLowering::lowerInterleavedLoad(
807     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
808     ArrayRef<unsigned> Indices, unsigned Factor) const {
809   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
810          "Invalid interleave factor");
811   assert(!Shuffles.empty() && "Empty shufflevector input");
812   assert(Shuffles.size() == Indices.size() &&
813          "Unmatched number of shufflevectors and indices");
814 
815   // Create an interleaved access group.
816   IRBuilder<> Builder(LI);
817   X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
818                                 Builder);
819 
820   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
821 }
822 
823 bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
824                                               ShuffleVectorInst *SVI,
825                                               unsigned Factor) const {
826   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
827          "Invalid interleave factor");
828 
829   assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
830          "Invalid interleaved store");
831 
832   // Holds the indices of SVI that correspond to the starting index of each
833   // interleaved shuffle.
834   SmallVector<unsigned, 4> Indices;
835   auto Mask = SVI->getShuffleMask();
836   for (unsigned i = 0; i < Factor; i++)
837     Indices.push_back(Mask[i]);
838 
839   ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
840 
841   // Create an interleaved access group.
842   IRBuilder<> Builder(SI);
843   X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
844                                 Builder);
845 
846   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
847 }
848