xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 //===- X86InterleavedAccess.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file contains the X86 implementation of the interleaved accesses
11 /// optimization generating X86-specific instructions/intrinsics for
12 /// interleaved access groups.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86ISelLowering.h"
17 #include "X86Subtarget.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/Analysis/VectorUtils.h"
21 #include "llvm/CodeGenTypes/MachineValueType.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DataLayout.h"
24 #include "llvm/IR/DerivedTypes.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/IR/Instruction.h"
27 #include "llvm/IR/Instructions.h"
28 #include "llvm/IR/Module.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/IR/Value.h"
31 #include "llvm/Support/Casting.h"
32 #include <algorithm>
33 #include <cassert>
34 #include <cmath>
35 #include <cstdint>
36 
37 using namespace llvm;
38 
39 namespace {
40 
41 /// This class holds necessary information to represent an interleaved
42 /// access group and supports utilities to lower the group into
43 /// X86-specific instructions/intrinsics.
44 ///  E.g. A group of interleaving access loads (Factor = 2; accessing every
45 ///       other element)
46 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
47 ///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
48 ///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
49 class X86InterleavedAccessGroup {
50   /// Reference to the wide-load instruction of an interleaved access
51   /// group.
52   Instruction *const Inst;
53 
54   /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
55   ArrayRef<ShuffleVectorInst *> Shuffles;
56 
57   /// Reference to the starting index of each user-shuffle.
58   ArrayRef<unsigned> Indices;
59 
60   /// Reference to the interleaving stride in terms of elements.
61   const unsigned Factor;
62 
63   /// Reference to the underlying target.
64   const X86Subtarget &Subtarget;
65 
66   const DataLayout &DL;
67 
68   IRBuilder<> &Builder;
69 
70   /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
71   /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
72   void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
73                  SmallVectorImpl<Instruction *> &DecomposedVectors);
74 
75   /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
76   /// returns the transposed-vectors in \p TransposedVectors.
77   /// E.g.
78   /// InputVectors:
79   ///   In-V0 = p1, p2, p3, p4
80   ///   In-V1 = q1, q2, q3, q4
81   ///   In-V2 = r1, r2, r3, r4
82   ///   In-V3 = s1, s2, s3, s4
83   /// OutputVectors:
84   ///   Out-V0 = p1, q1, r1, s1
85   ///   Out-V1 = p2, q2, r2, s2
86   ///   Out-V2 = p3, q3, r3, s3
87   ///   Out-V3 = P4, q4, r4, s4
88   void transpose_4x4(ArrayRef<Instruction *> InputVectors,
89                      SmallVectorImpl<Value *> &TransposedMatrix);
90   void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
91                              SmallVectorImpl<Value *> &TransposedMatrix,
92                              unsigned NumSubVecElems);
93   void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
94                                 SmallVectorImpl<Value *> &TransposedMatrix);
95   void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
96                              SmallVectorImpl<Value *> &TransposedMatrix,
97                              unsigned NumSubVecElems);
98   void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
99                                SmallVectorImpl<Value *> &TransposedMatrix,
100                                unsigned NumSubVecElems);
101 
102 public:
103   /// In order to form an interleaved access group X86InterleavedAccessGroup
104   /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
105   /// \p Shuffs, reference to the first indices of each interleaved-vector
106   /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
107   /// X86-specific instructions/intrinsics it also requires the underlying
108   /// target information \p STarget.
109   explicit X86InterleavedAccessGroup(Instruction *I,
110                                      ArrayRef<ShuffleVectorInst *> Shuffs,
111                                      ArrayRef<unsigned> Ind, const unsigned F,
112                                      const X86Subtarget &STarget,
113                                      IRBuilder<> &B)
114       : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
115         DL(Inst->getDataLayout()), Builder(B) {}
116 
117   /// Returns true if this interleaved access group can be lowered into
118   /// x86-specific instructions/intrinsics, false otherwise.
119   bool isSupported() const;
120 
121   /// Lowers this interleaved access group into X86-specific
122   /// instructions/intrinsics.
123   bool lowerIntoOptimizedSequence();
124 };
125 
126 } // end anonymous namespace
127 
128 bool X86InterleavedAccessGroup::isSupported() const {
129   VectorType *ShuffleVecTy = Shuffles[0]->getType();
130   Type *ShuffleEltTy = ShuffleVecTy->getElementType();
131   unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
132   unsigned WideInstSize;
133 
134   // Currently, lowering is supported for the following vectors:
135   // Stride 4:
136   //    1. Store and load of 4-element vectors of 64 bits on AVX.
137   //    2. Store of 16/32-element vectors of 8 bits on AVX.
138   // Stride 3:
139   //    1. Load of 16/32-element vectors of 8 bits on AVX.
140   if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
141     return false;
142 
143   if (isa<LoadInst>(Inst)) {
144     WideInstSize = DL.getTypeSizeInBits(Inst->getType());
145     if (cast<LoadInst>(Inst)->getPointerAddressSpace())
146       return false;
147   } else
148     WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
149 
150   // We support shuffle represents stride 4 for byte type with size of
151   // WideInstSize.
152   if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
153     return true;
154 
155   if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
156       (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
157        WideInstSize == 2048))
158     return true;
159 
160   if (ShuffleElemSize == 8 && Factor == 3 &&
161       (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
162     return true;
163 
164   return false;
165 }
166 
167 void X86InterleavedAccessGroup::decompose(
168     Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
169     SmallVectorImpl<Instruction *> &DecomposedVectors) {
170   assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
171          "Expected Load or Shuffle");
172 
173   Type *VecWidth = VecInst->getType();
174   (void)VecWidth;
175   assert(VecWidth->isVectorTy() &&
176          DL.getTypeSizeInBits(VecWidth) >=
177              DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
178          "Invalid Inst-size!!!");
179 
180   if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
181     Value *Op0 = SVI->getOperand(0);
182     Value *Op1 = SVI->getOperand(1);
183 
184     // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
185     for (unsigned i = 0; i < NumSubVectors; ++i)
186       DecomposedVectors.push_back(
187           cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
188               Op0, Op1,
189               createSequentialMask(Indices[i], SubVecTy->getNumElements(),
190                                    0))));
191     return;
192   }
193 
194   // Decompose the load instruction.
195   LoadInst *LI = cast<LoadInst>(VecInst);
196   Type *VecBaseTy;
197   unsigned int NumLoads = NumSubVectors;
198   // In the case of stride 3 with a vector of 32 elements load the information
199   // in the following way:
200   // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
201   unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
202   Value *VecBasePtr = LI->getPointerOperand();
203   if (VecLength == 768 || VecLength == 1536) {
204     VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
205     NumLoads = NumSubVectors * (VecLength / 384);
206   } else {
207     VecBaseTy = SubVecTy;
208   }
209   // Generate N loads of T type.
210   assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
211          "VecBaseTy's size must be a multiple of 8");
212   const Align FirstAlignment = LI->getAlign();
213   const Align SubsequentAlignment = commonAlignment(
214       FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8);
215   Align Alignment = FirstAlignment;
216   for (unsigned i = 0; i < NumLoads; i++) {
217     // TODO: Support inbounds GEP.
218     Value *NewBasePtr =
219         Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
220     Instruction *NewLoad =
221         Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
222     DecomposedVectors.push_back(NewLoad);
223     Alignment = SubsequentAlignment;
224   }
225 }
226 
227 // Changing the scale of the vector type by reducing the number of elements and
228 // doubling the scalar size.
229 static MVT scaleVectorType(MVT VT) {
230   unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
231   return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
232                           VT.getVectorNumElements() / 2);
233 }
234 
235 static constexpr int Concat[] = {
236     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
237     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
238     32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
239     48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
240 
241 // genShuffleBland - Creates shuffle according to two vectors.This function is
242 // only works on instructions with lane inside 256 registers. According to
243 // the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
244 // offset amount depends on the two integer, 'LowOffset' and 'HighOffset'.
245 // Where the 'LowOffset' refers to the first vector and the highOffset refers to
246 // the second vector.
247 // |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
248 // |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
249 // |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
250 // For the sequence to work as a mirror to the load.
251 // We must consider the elements order as above.
252 // In this function we are combining two types of shuffles.
253 // The first one is vpshufed and the second is a type of "blend" shuffle.
254 // By computing the shuffle on a sequence of 16 elements(one lane) and add the
255 // correct offset. We are creating a vpsuffed + blend sequence between two
256 // shuffles.
257 static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
258                             SmallVectorImpl<int> &Out, int LowOffset,
259                             int HighOffset) {
260   assert(VT.getSizeInBits() >= 256 &&
261          "This function doesn't accept width smaller then 256");
262   unsigned NumOfElm = VT.getVectorNumElements();
263   for (int I : Mask)
264     Out.push_back(I + LowOffset);
265   for (int I : Mask)
266     Out.push_back(I + HighOffset + NumOfElm);
267 }
268 
269 // reorderSubVector returns the data to is the original state. And de-facto is
270 // the opposite of  the function concatSubVector.
271 
272 // For VecElems = 16
273 // Invec[0] -  |0|      TransposedMatrix[0] - |0|
274 // Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
275 // Invec[2] -  |2|      TransposedMatrix[2] - |2|
276 
277 // For VecElems = 32
278 // Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
279 // Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
280 // Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|
281 
282 // For VecElems = 64
283 // Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
284 // Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
285 // Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|
286 
287 static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
288                              ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
289                              unsigned VecElems, unsigned Stride,
290                              IRBuilder<> &Builder) {
291 
292   if (VecElems == 16) {
293     for (unsigned i = 0; i < Stride; i++)
294       TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
295     return;
296   }
297 
298   SmallVector<int, 32> OptimizeShuf;
299   Value *Temp[8];
300 
301   for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
302     genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
303                     (i + 1) / Stride * 16);
304     Temp[i / 2] = Builder.CreateShuffleVector(
305         Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
306     OptimizeShuf.clear();
307   }
308 
309   if (VecElems == 32) {
310     std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
311     return;
312   } else
313     for (unsigned i = 0; i < Stride; i++)
314       TransposedMatrix[i] =
315           Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
316 }
317 
318 void X86InterleavedAccessGroup::interleave8bitStride4VF8(
319     ArrayRef<Instruction *> Matrix,
320     SmallVectorImpl<Value *> &TransposedMatrix) {
321   // Assuming we start from the following vectors:
322   // Matrix[0]= c0 c1 c2 c3 c4 ... c7
323   // Matrix[1]= m0 m1 m2 m3 m4 ... m7
324   // Matrix[2]= y0 y1 y2 y3 y4 ... y7
325   // Matrix[3]= k0 k1 k2 k3 k4 ... k7
326 
327   MVT VT = MVT::v8i16;
328   TransposedMatrix.resize(2);
329   SmallVector<int, 16> MaskLow;
330   SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
331   SmallVector<int, 32> MaskHighTemp1, MaskHighWord;
332 
333   for (unsigned i = 0; i < 8; ++i) {
334     MaskLow.push_back(i);
335     MaskLow.push_back(i + 8);
336   }
337 
338   createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
339   createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
340   narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
341   narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
342   // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
343   // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
344   Value *IntrVec1Low =
345       Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
346   Value *IntrVec2Low =
347       Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
348 
349   // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
350   // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
351 
352   TransposedMatrix[0] =
353       Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
354   TransposedMatrix[1] =
355       Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
356 }
357 
358 void X86InterleavedAccessGroup::interleave8bitStride4(
359     ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
360     unsigned NumOfElm) {
361   // Example: Assuming we start from the following vectors:
362   // Matrix[0]= c0 c1 c2 c3 c4 ... c31
363   // Matrix[1]= m0 m1 m2 m3 m4 ... m31
364   // Matrix[2]= y0 y1 y2 y3 y4 ... y31
365   // Matrix[3]= k0 k1 k2 k3 k4 ... k31
366 
367   MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
368   MVT HalfVT = scaleVectorType(VT);
369 
370   TransposedMatrix.resize(4);
371   SmallVector<int, 32> MaskHigh;
372   SmallVector<int, 32> MaskLow;
373   SmallVector<int, 32> LowHighMask[2];
374   SmallVector<int, 32> MaskHighTemp;
375   SmallVector<int, 32> MaskLowTemp;
376 
377   // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
378   // shuffle pattern.
379 
380   createUnpackShuffleMask(VT, MaskLow, true, false);
381   createUnpackShuffleMask(VT, MaskHigh, false, false);
382 
383   // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
384   // shuffle pattern.
385 
386   createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
387   createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
388   narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
389   narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);
390 
391   // IntrVec1Low  = c0  m0  c1  m1 ... c7  m7  | c16 m16 c17 m17 ... c23 m23
392   // IntrVec1High = c8  m8  c9  m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
393   // IntrVec2Low  = y0  k0  y1  k1 ... y7  k7  | y16 k16 y17 k17 ... y23 k23
394   // IntrVec2High = y8  k8  y9  k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
395   Value *IntrVec[4];
396 
397   IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
398   IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
399   IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
400   IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
401 
402   // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
403   // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
404   // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
405   // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27
406 
407   Value *VecOut[4];
408   for (int i = 0; i < 4; i++)
409     VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
410                                             LowHighMask[i % 2]);
411 
412   // cmyk0  cmyk1  cmyk2  cmyk3   | cmyk4  cmyk5  cmyk6  cmyk7
413   // cmyk8  cmyk9  cmyk10 cmyk11  | cmyk12 cmyk13 cmyk14 cmyk15
414   // cmyk16 cmyk17 cmyk18 cmyk19  | cmyk20 cmyk21 cmyk22 cmyk23
415   // cmyk24 cmyk25 cmyk26 cmyk27  | cmyk28 cmyk29 cmyk30 cmyk31
416 
417   if (VT == MVT::v16i8) {
418     std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
419     return;
420   }
421 
422   reorderSubVector(VT, TransposedMatrix, VecOut, ArrayRef(Concat, 16), NumOfElm,
423                    4, Builder);
424 }
425 
426 //  createShuffleStride returns shuffle mask of size N.
427 //  The shuffle pattern is as following :
428 //  {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
429 //  (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
430 //  (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
431 //  Where Lane is the # of lanes in a register:
432 //  VectorSize = 128 => Lane = 1
433 //  VectorSize = 256 => Lane = 2
434 //  For example shuffle pattern for VF 16 register size 256 -> lanes = 2
435 //  {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
436 static void createShuffleStride(MVT VT, int Stride,
437                                 SmallVectorImpl<int> &Mask) {
438   int VectorSize = VT.getSizeInBits();
439   int VF = VT.getVectorNumElements();
440   int LaneCount = std::max(VectorSize / 128, 1);
441   for (int Lane = 0; Lane < LaneCount; Lane++)
442     for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
443       Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
444 }
445 
446 //  setGroupSize sets 'SizeInfo' to the size(number of elements) of group
447 //  inside mask a shuffleMask. A mask contains exactly 3 groups, where
448 //  each group is a monotonically increasing sequence with stride 3.
449 //  For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
450 static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
451   int VectorSize = VT.getSizeInBits();
452   int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
453   for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
454     int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
455     SizeInfo.push_back(GroupSize);
456     FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
457   }
458 }
459 
460 //  DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
461 //  vpalign works according to lanes
462 //  Where Lane is the # of lanes in a register:
463 //  VectorWide = 128 => Lane = 1
464 //  VectorWide = 256 => Lane = 2
465 //  For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
466 //  For Lane = 2 shuffle pattern is:
467 //  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
468 //  Imm variable sets the offset amount. The result of the
469 //  function is stored inside ShuffleMask vector and it built as described in
470 //  the begin of the description. AlignDirection is a boolean that indicates the
471 //  direction of the alignment. (false - align to the "right" side while true -
472 //  align to the "left" side)
473 static void DecodePALIGNRMask(MVT VT, unsigned Imm,
474                               SmallVectorImpl<int> &ShuffleMask,
475                               bool AlignDirection = true, bool Unary = false) {
476   unsigned NumElts = VT.getVectorNumElements();
477   unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
478   unsigned NumLaneElts = NumElts / NumLanes;
479 
480   Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
481   unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
482 
483   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
484     for (unsigned i = 0; i != NumLaneElts; ++i) {
485       unsigned Base = i + Offset;
486       // if i+offset is out of this lane then we actually need the other source
487       // If Unary the other source is the first source.
488       if (Base >= NumLaneElts)
489         Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
490       ShuffleMask.push_back(Base + l);
491     }
492   }
493 }
494 
495 // concatSubVector - The function rebuilds the data to a correct expected
496 // order. An assumption(The shape of the matrix) was taken for the
497 // deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
498 // This function ensures that the data is built in correct way for the lane
499 // instructions. Each lane inside the vector is a 128-bit length.
500 //
501 // The 'InVec' argument contains the data in increasing order. In InVec[0] You
502 // can find the first 128 bit data. The number of different lanes inside a
503 // vector depends on the 'VecElems'.In general, the formula is
504 // VecElems * type / 128. The size of the array 'InVec' depends and equal to
505 // 'VecElems'.
506 
507 // For VecElems = 16
508 // Invec[0] - |0|      Vec[0] - |0|
509 // Invec[1] - |1|  =>  Vec[1] - |1|
510 // Invec[2] - |2|      Vec[2] - |2|
511 
512 // For VecElems = 32
513 // Invec[0] - |0|1|      Vec[0] - |0|3|
514 // Invec[1] - |2|3|  =>  Vec[1] - |1|4|
515 // Invec[2] - |4|5|      Vec[2] - |2|5|
516 
517 // For VecElems = 64
518 // Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
519 // Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
520 // Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|
521 
522 static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
523                             unsigned VecElems, IRBuilder<> &Builder) {
524   if (VecElems == 16) {
525     for (int i = 0; i < 3; i++)
526       Vec[i] = InVec[i];
527     return;
528   }
529 
530   for (unsigned j = 0; j < VecElems / 32; j++)
531     for (int i = 0; i < 3; i++)
532       Vec[i + j * 3] = Builder.CreateShuffleVector(
533           InVec[j * 6 + i], InVec[j * 6 + i + 3], ArrayRef(Concat, 32));
534 
535   if (VecElems == 32)
536     return;
537 
538   for (int i = 0; i < 3; i++)
539     Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
540 }
541 
542 void X86InterleavedAccessGroup::deinterleave8bitStride3(
543     ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
544     unsigned VecElems) {
545   // Example: Assuming we start from the following vectors:
546   // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
547   // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
548   // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
549 
550   TransposedMatrix.resize(3);
551   SmallVector<int, 32> VPShuf;
552   SmallVector<int, 32> VPAlign[2];
553   SmallVector<int, 32> VPAlign2;
554   SmallVector<int, 32> VPAlign3;
555   SmallVector<int, 3> GroupSize;
556   Value *Vec[6], *TempVector[3];
557 
558   MVT VT = MVT::getVT(Shuffles[0]->getType());
559 
560   createShuffleStride(VT, 3, VPShuf);
561   setGroupSize(VT, GroupSize);
562 
563   for (int i = 0; i < 2; i++)
564     DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
565 
566   DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
567   DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
568 
569   concatSubVector(Vec, InVec, VecElems, Builder);
570   // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
571   // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
572   // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
573 
574   for (int i = 0; i < 3; i++)
575     Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
576 
577   // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
578   // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
579   // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
580 
581   for (int i = 0; i < 3; i++)
582     TempVector[i] =
583         Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
584 
585   // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
586   // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
587   // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
588 
589   for (int i = 0; i < 3; i++)
590     Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
591                                          VPAlign[1]);
592 
593   // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
594   // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
595   // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
596 
597   Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
598   TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
599   TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
600   TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
601 }
602 
603 // group2Shuffle reorder the shuffle stride back into continuous order.
604 // For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
605 // MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
606 static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
607                           SmallVectorImpl<int> &Output) {
608   int IndexGroup[3] = {0, 0, 0};
609   int Index = 0;
610   int VectorWidth = VT.getSizeInBits();
611   int VF = VT.getVectorNumElements();
612   // Find the index of the different groups.
613   int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
614   for (int i = 0; i < 3; i++) {
615     IndexGroup[(Index * 3) % (VF / Lane)] = Index;
616     Index += Mask[i];
617   }
618   // According to the index compute the convert mask.
619   for (int i = 0; i < VF / Lane; i++) {
620     Output.push_back(IndexGroup[i % 3]);
621     IndexGroup[i % 3]++;
622   }
623 }
624 
625 void X86InterleavedAccessGroup::interleave8bitStride3(
626     ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
627     unsigned VecElems) {
628   // Example: Assuming we start from the following vectors:
629   // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
630   // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
631   // Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7
632 
633   TransposedMatrix.resize(3);
634   SmallVector<int, 3> GroupSize;
635   SmallVector<int, 32> VPShuf;
636   SmallVector<int, 32> VPAlign[3];
637   SmallVector<int, 32> VPAlign2;
638   SmallVector<int, 32> VPAlign3;
639 
640   Value *Vec[3], *TempVector[3];
641   MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
642 
643   setGroupSize(VT, GroupSize);
644 
645   for (int i = 0; i < 3; i++)
646     DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);
647 
648   DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
649   DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);
650 
651   // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
652   // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
653   // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
654 
655   Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
656   Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
657   Vec[2] = InVec[2];
658 
659   // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
660   // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
661   // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
662 
663   for (int i = 0; i < 3; i++)
664     TempVector[i] =
665         Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
666 
667   // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
668   // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
669   // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
670 
671   for (int i = 0; i < 3; i++)
672     Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
673                                          VPAlign[2]);
674 
675   // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
676   // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
677   // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
678 
679   unsigned NumOfElm = VT.getVectorNumElements();
680   group2Shuffle(VT, GroupSize, VPShuf);
681   reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
682 }
683 
684 void X86InterleavedAccessGroup::transpose_4x4(
685     ArrayRef<Instruction *> Matrix,
686     SmallVectorImpl<Value *> &TransposedMatrix) {
687   assert(Matrix.size() == 4 && "Invalid matrix size");
688   TransposedMatrix.resize(4);
689 
690   // dst = src1[0,1],src2[0,1]
691   static constexpr int IntMask1[] = {0, 1, 4, 5};
692   ArrayRef<int> Mask = ArrayRef(IntMask1, 4);
693   Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
694   Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
695 
696   // dst = src1[2,3],src2[2,3]
697   static constexpr int IntMask2[] = {2, 3, 6, 7};
698   Mask = ArrayRef(IntMask2, 4);
699   Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
700   Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
701 
702   // dst = src1[0],src2[0],src1[2],src2[2]
703   static constexpr int IntMask3[] = {0, 4, 2, 6};
704   Mask = ArrayRef(IntMask3, 4);
705   TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
706   TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
707 
708   // dst = src1[1],src2[1],src1[3],src2[3]
709   static constexpr int IntMask4[] = {1, 5, 3, 7};
710   Mask = ArrayRef(IntMask4, 4);
711   TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
712   TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
713 }
714 
715 // Lowers this interleaved access group into X86-specific
716 // instructions/intrinsics.
717 bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
718   SmallVector<Instruction *, 4> DecomposedVectors;
719   SmallVector<Value *, 4> TransposedVectors;
720   auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());
721 
722   if (isa<LoadInst>(Inst)) {
723     auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
724     unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
725     switch (NumSubVecElems) {
726     default:
727       return false;
728     case 4:
729     case 8:
730     case 16:
731     case 32:
732     case 64:
733       if (ShuffleTy->getNumElements() != NumSubVecElems)
734         return false;
735       break;
736     }
737 
738     // Try to generate target-sized register(/instruction).
739     decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
740 
741     // Perform matrix-transposition in order to compute interleaved
742     // results by generating some sort of (optimized) target-specific
743     // instructions.
744 
745     if (NumSubVecElems == 4)
746       transpose_4x4(DecomposedVectors, TransposedVectors);
747     else
748       deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
749                               NumSubVecElems);
750 
751     // Now replace the unoptimized-interleaved-vectors with the
752     // transposed-interleaved vectors.
753     for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
754       Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
755 
756     return true;
757   }
758 
759   Type *ShuffleEltTy = ShuffleTy->getElementType();
760   unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;
761 
762   // Lower the interleaved stores:
763   //   1. Decompose the interleaved wide shuffle into individual shuffle
764   //   vectors.
765   decompose(Shuffles[0], Factor,
766             FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
767             DecomposedVectors);
768 
769   //   2. Transpose the interleaved-vectors into vectors of contiguous
770   //      elements.
771   switch (NumSubVecElems) {
772   case 4:
773     transpose_4x4(DecomposedVectors, TransposedVectors);
774     break;
775   case 8:
776     interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
777     break;
778   case 16:
779   case 32:
780   case 64:
781     if (Factor == 4)
782       interleave8bitStride4(DecomposedVectors, TransposedVectors,
783                             NumSubVecElems);
784     if (Factor == 3)
785       interleave8bitStride3(DecomposedVectors, TransposedVectors,
786                             NumSubVecElems);
787     break;
788   default:
789     return false;
790   }
791 
792   //   3. Concatenate the contiguous-vectors back into a wide vector.
793   Value *WideVec = concatenateVectors(Builder, TransposedVectors);
794 
795   //   4. Generate a store instruction for wide-vec.
796   StoreInst *SI = cast<StoreInst>(Inst);
797   Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());
798 
799   return true;
800 }
801 
802 // Lower interleaved load(s) into target specific instructions/
803 // intrinsics. Lowering sequence varies depending on the vector-types, factor,
804 // number of shuffles and ISA.
805 // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
806 bool X86TargetLowering::lowerInterleavedLoad(
807     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
808     ArrayRef<unsigned> Indices, unsigned Factor) const {
809   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
810          "Invalid interleave factor");
811   assert(!Shuffles.empty() && "Empty shufflevector input");
812   assert(Shuffles.size() == Indices.size() &&
813          "Unmatched number of shufflevectors and indices");
814 
815   // Create an interleaved access group.
816   IRBuilder<> Builder(LI);
817   X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
818                                 Builder);
819 
820   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
821 }
822 
823 bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
824                                               ShuffleVectorInst *SVI,
825                                               unsigned Factor) const {
826   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
827          "Invalid interleave factor");
828 
829   assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
830              0 &&
831          "Invalid interleaved store");
832 
833   // Holds the indices of SVI that correspond to the starting index of each
834   // interleaved shuffle.
835   SmallVector<unsigned, 4> Indices;
836   auto Mask = SVI->getShuffleMask();
837   for (unsigned i = 0; i < Factor; i++)
838     Indices.push_back(Mask[i]);
839 
840   ArrayRef<ShuffleVectorInst *> Shuffles = ArrayRef(SVI);
841 
842   // Create an interleaved access group.
843   IRBuilder<> Builder(SI);
844   X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
845                                 Builder);
846 
847   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
848 }
849