Lines Matching +full:2 +full:- +full:lane

1 //===- X86InterleavedAccess.cpp -------------------------------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
11 /// optimization generating X86-specific instructions/intrinsics for
14 //===----------------------------------------------------------------------===//
43 /// X86-specific instructions/intrinsics.
44 /// E.g. A group of interleaving access loads (Factor = 2; accessing every
47 /// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
50 /// Reference to the wide-load instruction of an interleaved access
57 /// Reference to the starting index of each user-shuffle.
71 /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
76 /// returns the transposed-vectors in \p TransposedVectors.
79 /// In-V0 = p1, p2, p3, p4
80 /// In-V1 = q1, q2, q3, q4
81 /// In-V2 = r1, r2, r3, r4
82 /// In-V3 = s1, s2, s3, s4
84 /// Out-V0 = p1, q1, r1, s1
85 /// Out-V1 = p2, q2, r2, s2
86 /// Out-V2 = p3, q3, r3, s3
87 /// Out-V3 = p4, q4, r4, s4
104 /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
105 /// \p Shuffs, reference to the first indices of each interleaved-vector
107 /// X86-specific instructions/intrinsics it also requires the underlying
115 DL(Inst->getDataLayout()), Builder(B) {} in X86InterleavedAccessGroup()
118 /// x86-specific instructions/intrinsics, false otherwise.
121 /// Lowers this interleaved access group into X86-specific
129 VectorType *ShuffleVecTy = Shuffles[0]->getType(); in isSupported()
130 Type *ShuffleEltTy = ShuffleVecTy->getElementType(); in isSupported()
136 // 1. Store and load of 4-element vectors of 64 bits on AVX. in isSupported()
137 // 2. Store of 16/32-element vectors of 8 bits on AVX. in isSupported()
139 // 1. Load of 16/32-element vectors of 8 bits on AVX. in isSupported()
144 WideInstSize = DL.getTypeSizeInBits(Inst->getType()); in isSupported()
145 if (cast<LoadInst>(Inst)->getPointerAddressSpace()) in isSupported()
148 WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType()); in isSupported()
173 Type *VecWidth = VecInst->getType(); in decompose()
175 assert(VecWidth->isVectorTy() && in decompose()
178 "Invalid Inst-size!!!"); in decompose()
181 Value *Op0 = SVI->getOperand(0); in decompose()
182 Value *Op1 = SVI->getOperand(1); in decompose()
189 createSequentialMask(Indices[i], SubVecTy->getNumElements(), in decompose()
200 // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1] in decompose()
202 Value *VecBasePtr = LI->getPointerOperand(); in decompose()
204 VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16); in decompose()
210 assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) && in decompose()
212 const Align FirstAlignment = LI->getAlign(); in decompose()
214 FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8); in decompose()
230 unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2; in scaleVectorType()
232 VT.getVectorNumElements() / 2); in scaleVectorType()
236 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
241 // genShuffleBland - Creates shuffle according to two vectors. This function
242 // only works on instructions with lanes inside 256-bit registers. According to
254 // By computing the shuffle on a sequence of 16 elements(one lane) and add the
269 // reorderSubVector returns the data to its original state. And de-facto is
273 // Invec[0] - |0| TransposedMatrix[0] - |0|
274 // Invec[1] - |1| => TransposedMatrix[1] - |1|
275 // Invec[2] - |2| TransposedMatrix[2] - |2|
278 // Invec[0] - |0|3| TransposedMatrix[0] - |0|1|
279 // Invec[1] - |1|4| => TransposedMatrix[1] - |2|3|
280 // Invec[2] - |2|5| TransposedMatrix[2] - |4|5|
283 // Invec[0] - |0|3|6|9 | TransposedMatrix[0] - |0|1|2 |3 |
284 // Invec[1] - |1|4|7|10| => TransposedMatrix[1] - |4|5|6 |7 |
285 // Invec[2] - |2|5|8|11| TransposedMatrix[2] - |8|9|10|11|
301 for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) { in reorderSubVector()
304 Temp[i / 2] = Builder.CreateShuffleVector( in reorderSubVector()
315 Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat); in reorderSubVector()
324 // Matrix[2]= y0 y1 y2 y3 y4 ... y7 in interleave8bitStride4VF8()
328 TransposedMatrix.resize(2); in interleave8bitStride4VF8()
340 narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord); in interleave8bitStride4VF8()
341 narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord); in interleave8bitStride4VF8()
347 Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow); in interleave8bitStride4VF8()
364 // Matrix[2]= y0 y1 y2 y3 y4 ... y31 in interleave8bitStride4()
373 SmallVector<int, 32> LowHighMask[2]; in interleave8bitStride4()
388 narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]); in interleave8bitStride4()
389 narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]); in interleave8bitStride4()
399 IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow); in interleave8bitStride4()
400 IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh); in interleave8bitStride4()
409 VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2], in interleave8bitStride4()
410 LowHighMask[i % 2]); in interleave8bitStride4()
428 // {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
429 // (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
430 // (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
431 // Where Lane is the # of lanes in a register:
432 // VectorSize = 128 => Lane = 1
433 // VectorSize = 256 => Lane = 2
434 // For example shuffle pattern for VF 16 register size 256 -> lanes = 2
435 // {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
441 for (int Lane = 0; Lane < LaneCount; Lane++) in createShuffleStride() local
443 Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane); in createShuffleStride()
449 // For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
454 int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0); in setGroupSize()
462 // Where Lane is the # of lanes in a register:
463 // VectorWide = 128 => Lane = 1
464 // VectorWide = 256 => Lane = 2
465 // For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
466 // For Lane = 2 shuffle pattern is:
467 // {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
471 // direction of the alignment. (false - align to the "right" side while true -
480 Imm = AlignDirection ? Imm : (NumLaneElts - Imm); in DecodePALIGNRMask()
486 // if i+offset is out of this lane then we actually need the other source in DecodePALIGNRMask()
489 Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts; in DecodePALIGNRMask()
495 // concatSubVector - The function rebuilds the data to a correct expected
497 // deinterleaved to work with lane's instructions like 'vpalignr' or 'vpshufb'.
498 // This function ensures that the data is built in correct way for the lane
499 // instructions. Each lane inside the vector is a 128-bit length.
508 // Invec[0] - |0| Vec[0] - |0|
509 // Invec[1] - |1| => Vec[1] - |1|
510 // Invec[2] - |2| Vec[2] - |2|
513 // Invec[0] - |0|1| Vec[0] - |0|3|
514 // Invec[1] - |2|3| => Vec[1] - |1|4|
515 // Invec[2] - |4|5| Vec[2] - |2|5|
518 // Invec[0] - |0|1|2 |3 | Vec[0] - |0|3|6|9 |
519 // Invec[1] - |4|5|6 |7 | => Vec[1] - |1|4|7|10|
520 // Invec[2] - |8|9|10|11| Vec[2] - |2|5|8|11|
548 // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7 in deinterleave8bitStride3()
552 SmallVector<int, 32> VPAlign[2]; in deinterleave8bitStride3()
558 MVT VT = MVT::getVT(Shuffles[0]->getType()); in deinterleave8bitStride3()
563 for (int i = 0; i < 2; i++) in deinterleave8bitStride3()
564 DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false); in deinterleave8bitStride3()
566 DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true); in deinterleave8bitStride3()
572 // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7 in deinterleave8bitStride3()
579 // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7 in deinterleave8bitStride3()
583 Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]); in deinterleave8bitStride3()
587 // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7 in deinterleave8bitStride3()
595 // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7 in deinterleave8bitStride3()
599 TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec; in deinterleave8bitStride3()
600 TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2]; in deinterleave8bitStride3()
604 // For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
605 // MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
613 int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1; in group2Shuffle() local
615 IndexGroup[(Index * 3) % (VF / Lane)] = Index; in group2Shuffle()
619 for (int i = 0; i < VF / Lane; i++) { in group2Shuffle()
631 // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7 in interleave8bitStride3()
648 DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true); in interleave8bitStride3()
653 // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7 in interleave8bitStride3()
657 Vec[2] = InVec[2]; in interleave8bitStride3()
661 // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7 in interleave8bitStride3()
665 Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]); in interleave8bitStride3()
669 // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7 in interleave8bitStride3()
673 VPAlign[2]); in interleave8bitStride3()
677 // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7 in interleave8bitStride3()
693 Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); in transpose_4x4()
696 // dst = src1[2,3],src2[2,3] in transpose_4x4()
697 static constexpr int IntMask2[] = {2, 3, 6, 7}; in transpose_4x4()
699 Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); in transpose_4x4()
702 // dst = src1[0],src2[0],src1[2],src2[2] in transpose_4x4()
703 static constexpr int IntMask3[] = {0, 4, 2, 6}; in transpose_4x4()
706 TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); in transpose_4x4()
715 // Lowers this interleaved access group into X86-specific
720 auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType()); in lowerIntoOptimizedSequence()
723 auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType()); in lowerIntoOptimizedSequence()
724 unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor; in lowerIntoOptimizedSequence()
733 if (ShuffleTy->getNumElements() != NumSubVecElems) in lowerIntoOptimizedSequence()
738 // Try to generate target-sized register(/instruction). in lowerIntoOptimizedSequence()
741 // Perform matrix-transposition in order to compute interleaved in lowerIntoOptimizedSequence()
742 // results by generating some sort of (optimized) target-specific in lowerIntoOptimizedSequence()
751 // Now replace the unoptimized-interleaved-vectors with the in lowerIntoOptimizedSequence()
752 // transposed-interleaved vectors. in lowerIntoOptimizedSequence()
754 Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); in lowerIntoOptimizedSequence()
759 Type *ShuffleEltTy = ShuffleTy->getElementType(); in lowerIntoOptimizedSequence()
760 unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor; in lowerIntoOptimizedSequence()
769 // 2. Transpose the interleaved-vectors into vectors of contiguous in lowerIntoOptimizedSequence()
792 // 3. Concatenate the contiguous-vectors back into a wide vector. in lowerIntoOptimizedSequence()
795 // 4. Generate a store instruction for wide-vec. in lowerIntoOptimizedSequence()
797 Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign()); in lowerIntoOptimizedSequence()
803 // intrinsics. Lowering sequence varies depending on the vector-types, factor,
809 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && in lowerInterleavedLoad()
826 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && in lowerInterleavedStore()
829 assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor == in lowerInterleavedStore()
836 auto Mask = SVI->getShuffleMask(); in lowerInterleavedStore()