InterleavedLoadCombinePass.cpp - OpenGrok cross reference for /freebsd/contrib/llvm-project/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp

Lines Matching +full:non +full:- +full:interleaved
1 //===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
11 // This file defines the interleaved-load-combine pass. The pass searches for
16 // executed just before InterleavedAccesPass to find any left-over instances
19 //===----------------------------------------------------------------------===//
50 #define DEBUG_TYPE "interleaved-load-combine"
59     "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
60     cl::desc("Disable combining of interleaved loads"));
70         TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), TTI(TTI) {}  in InterleavedLoadCombineImpl()
72   /// Scan the function for interleaved load candidates and execute the
96   /// Replace interleaved load candidates. It does additional
103   /// factor, find a set that represents a 'factor' interleaved load.
109 /// First Order Polynomial on an n-Bit Integer Value
111 /// Polynomial(Value) = Value * B + A + E*2^(n-e)
113 /// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
160 // Pb_5 - Pa_5 = 16         #0 | subtract to get the offset
175   unsigned ErrorMSBs = (unsigned)-1;
188     IntegerType *Ty = dyn_cast<IntegerType>(V->getType());  in Polynomial()
191       this->V = V;  in Polynomial()
192       A = APInt(Ty->getBitWidth(), 0);  in Polynomial()
206     if (ErrorMSBs == (unsigned)-1)  in incErrorMSBs()
216     if (ErrorMSBs == (unsigned)-1)  in decErrorMSBs()
220       ErrorMSBs -= amt;  in decErrorMSBs()
240     //   (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e)  in add()
244       ErrorMSBs = (unsigned)-1;  in add()
260     //   (B+A)*C =-  in mul()
273     //   Let B' and A' be the n-Bit inputs with some unknown errors EA,  in mul()
276     //     B' = B + 2^(n-e)*EB  in mul()
277     //     A' = A + 2^(n-e)*EA  in mul()
286     //     (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' =  in mul()
287     //                     = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =  in mul()
289     //                     = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =  in mul()
290     //                     = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' =  in mul()
291     //                     = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C' =  in mul()
292     //                     = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C*2^c =  in mul()
293     //                     = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c =  in mul()
297     //                     = (B + A)*C' + EC*2^(n-e)*2^c =  in mul()
298     //                     = (B + A)*C' + EC*2^(n-(e-c))  in mul()
300     //   Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c  in mul()
305       ErrorMSBs = (unsigned)-1;  in mul()
309     // Multiplying by one is a no-op.  in mul()
331     // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e')  in lshr()
334     //             E is a e-bit number,  in lshr()
335     //             E' is a e'-bit number,  in lshr()
343     //  B = b_h * 2^(n-1) + b_m * 2 + b_l  in lshr()
344     //  A = a_h * 2^(n-1) + a_m * 2         (pre(1))  in lshr()
346     //  where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers  in lshr()
348     //  Let X = (B + A + E*2^(n-e)) >> 1  in lshr()
349     //  Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1  in lshr()
351     //    X = [B + A + E*2^(n-e)] >> 1 =  in lshr()
352     //      = [  b_h * 2^(n-1) + b_m * 2 + b_l +  in lshr()
353     //         + a_h * 2^(n-1) + a_m * 2 +  in lshr()
354     //         + E * 2^(n-e) ] >> 1 =  in lshr()
357     //    2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within  in lshr()
362     //      = [  ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +  in lshr()
363     //         + ((b_m + a_m) % 2^(n-2)) * 2 +  in lshr()
364     //         + b_l + E * 2^(n-e) ] >> 1 =  in lshr()
369     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +  in lshr()
370     //         + ((b_m + a_m) % 2^(n-2)) +  in lshr()
371     //         + E * 2^(n-(e+1)) =  in lshr()
375     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +  in lshr()
376     //         + ((b_m + a_m) % 2^(n-2)) +  in lshr()
377     //         + E * 2^(n-e') =  in lshr()
381     //    Y =  (B >> 1) + (A >> 1) + E*2^(n-e') =  in lshr()
382     //      =    (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +  in lshr()
383     //         + (a_h * 2^(n-1) + a_m * 2) >> 1 +  in lshr()
384     //         + E * 2^(n-e) >> 1 =  in lshr()
389     //      =     b_h * 2^(n-2) + b_m +  in lshr()
390     //         +  a_h * 2^(n-2) + a_m +  in lshr()
391     //         +  E * 2^(n-(e+1)) =  in lshr()
394     //    the term 2^(n-1). But this time there is room for a second bit in the  in lshr()
395     //    term 2^(n-2) we add this bit to a new term and denote it o_h in a  in lshr()
398     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +  in lshr()
399     //         + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +  in lshr()
400     //         + ((b_m + a_m) % 2^(n-2)) +  in lshr()
401     //         + E * 2^(n-(e+1)) =  in lshr()
403     //    Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1  in lshr()
406     //      =    o_h * 2^(n-1) +  in lshr()
407     //         + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +  in lshr()
408     //         + ((b_m + a_m) % 2^(n-2)) +  in lshr()
409     //         + E * 2^(n-e') =  in lshr()
414     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +  in lshr()
415     //         + ((b_m + a_m) % 2^(n-2)) +  in lshr()
416     //         + o_h * 2^(e'-1) * 2^(n-e') +               | pre(2), move 2^(e'-1)  in lshr()
418     //         + E * 2^(n-e') =  in lshr()
419     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +  in lshr()
420     //         + ((b_m + a_m) % 2^(n-2)) +  in lshr()
421     //         + [o_h * 2^(e'-1) + E] * 2^(n-e') +         | move 2^(e'-1) out of  in lshr()
424     //    Let E' = o_h * 2^(e'-1) + E  in lshr()
426     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +  in lshr()
427     //         + ((b_m + a_m) % 2^(n-2)) +  in lshr()
428     //         + E' * 2^(n-e')  in lshr()
437     // In this case Theorem(1) transforms to (the pre-condition on A can also be  in lshr()
453     //   hold. This is trivially the case for E' = X - Y.  in lshr()
462       ErrorMSBs = (unsigned)-1;  in lshr()
491   /// Apply a sign-extend or truncate operation on the polynomial.
496       decErrorMSBs(A.getBitWidth() - n);  in sextOrTrunc()
503       incErrorMSBs(n - A.getBitWidth());  in sextOrTrunc()
544   Polynomial operator-(const Polynomial &o) const {  in operator -()
552     return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));  in operator -()
556   Polynomial operator-(uint64_t C) const {  in operator -()
558     Result.A -= C;  in operator -()
572     Polynomial r = *this - o;  in isProvenEqualTo()
657   /// Basic-block the load instructions are within
669   /// Final shuffle-vector instruction
679     EI = new ElementInfo[VTy->getNumElements()];  in VectorInfo()
686   unsigned getDimension() const { return VTy->getNumElements(); }  in getDimension()
688   /// Test if the VectorInfo can be part of an interleaved load with the
696     unsigned Size = DL.getTypeAllocSize(VTy->getElementType());  in isInterleaved()
734     Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));  in computeFromBCI()
739     FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());  in computeFromBCI()
744     if (Result.VTy->getNumElements() % VTy->getNumElements())  in computeFromBCI()
747     unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();  in computeFromBCI()
748     unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());  in computeFromBCI()
749     unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());  in computeFromBCI()
758     for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {  in computeFromBCI()
790         cast<FixedVectorType>(SVI->getOperand(0)->getType());  in computeFromSVI()
794     if (!compute(SVI->getOperand(0), LHS, DL))  in computeFromSVI()
799     if (!compute(SVI->getOperand(1), RHS, DL))  in computeFromSVI()
838     for (int i : SVI->getShuffleMask()) {  in computeFromSVI()
839       assert((i < 2 * (signed)ArgTy->getNumElements()) &&  in computeFromSVI()
844       else if (i < (signed)ArgTy->getNumElements()) {  in computeFromSVI()
851           Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];  in computeFromSVI()
874     if (LI->isVolatile())  in computeFromLI()
877     if (LI->isAtomic())  in computeFromLI()
880     if (!DL.typeSizeEqualsStoreSize(Result.VTy->getElementType()))  in computeFromLI()
884     computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);  in computeFromLI()
886     Result.BB = LI->getParent();  in computeFromLI()
893           ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),  in computeFromLI()
894           ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),  in computeFromLI()
925       Result.add(C->getValue());  in computePolynomialBinOp()
933       Result.lshr(C->getValue());  in computePolynomialBinOp()
971         DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());  in computePolynomialFromPointer()
999         // non-constant.  in computePolynomialFromPointer()
1037       Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);  in computePolynomialFromPointer()
1062     // Try to find an interleaved load using the front of Worklist as first line  in findPattern()
1063     unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());  in findPattern()
1069       if (C->VTy != C0->VTy)  in findPattern()
1071       if (C->BB != C0->BB)  in findPattern()
1073       if (C->PV != C0->PV)  in findPattern()
1076       // Check the current value matches any of factor - 1 remaining lines  in findPattern()
1078         if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {  in findPattern()
1110   BasicBlock *BB = (*LIs.begin())->getParent();  in findFirstLoad()
1112       *BB, [&LIs](Instruction &I) -> bool { return is_contained(LIs, &I); });  in findFirstLoad()
1113   assert(FLI != BB->end());  in findFirstLoad()
1120   LLVM_DEBUG(dbgs() << "Checking interleaved load\n");  in combine()
1148     // interleaved. This list excludes the instructions necessary for the  in combine()
1173     for (auto *U : I->users()) {  in combine()
1192     auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();  in combine()
1207   Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();  in combine()
1209       cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())  in combine()
1210           ->getNumElements();  in combine()
1215       Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),  in combine()
1216       InsertionPoint->getPointerAddressSpace(), CostKind);  in combine()
1223   auto Ptr = InsertionPoint->getPointerOperand();  in combine()
1224   auto LI = Builder.CreateAlignedLoad(ILTy, Ptr, InsertionPoint->getAlign(),  in combine()
1225                                       "interleaved.wide.load");  in combine()
1239     auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle");  in combine()
1240     VI.SVI->replaceAllUsesWith(SVI);  in combine()
1246     return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)  in combine()
1247            << "Load interleaved combined with factor "  in combine()
1262   for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {  in run()
1269           if (isa<ScalableVectorType>(SVI->getType()))  in run()
1272           Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));  in run()
1291         // Remove the first element of the Interleaved Load but put the others  in run()
1305 /// This pass combines interleaved loads into a pattern detectable by
1315     return "Interleaved Load Combine Pass";  in getPassName()
1333                TPC->getTM<TargetMachine>())  in runOnFunction()
1362     "Combine interleaved loads into wide loads and shufflevector instructions",
1369     "Combine interleaved loads into wide loads and shufflevector instructions",  in INITIALIZE_PASS_DEPENDENCY()