Lines Matching +full:non +full:- +full:interleaved

1 //===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
11 // This file defines the interleaved-load-combine pass. The pass searches for
16 // executed just before InterleavedAccesPass to find any left-over instances
19 //===----------------------------------------------------------------------===//
50 #define DEBUG_TYPE "interleaved-load-combine"
59 "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
60 cl::desc("Disable combining of interleaved loads"));
70 TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), TTI(TTI) {} in InterleavedLoadCombineImpl()
72 /// Scan the function for interleaved load candidates and execute the
96 /// Replace interleaved load candidates. It does additional
103 /// factor, find a set that represents a 'factor' interleaved load.
109 /// First Order Polynomial on an n-Bit Integer Value
111 /// Polynomial(Value) = Value * B + A + E*2^(n-e)
113 /// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
160 // Pb_5 - Pa_5 = 16 #0 | subtract to get the offset
175 unsigned ErrorMSBs = (unsigned)-1;
188 IntegerType *Ty = dyn_cast<IntegerType>(V->getType()); in Polynomial()
191 this->V = V; in Polynomial()
192 A = APInt(Ty->getBitWidth(), 0); in Polynomial()
206 if (ErrorMSBs == (unsigned)-1) in incErrorMSBs()
216 if (ErrorMSBs == (unsigned)-1) in decErrorMSBs()
220 ErrorMSBs -= amt; in decErrorMSBs()
240 // (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e) in add()
244 ErrorMSBs = (unsigned)-1; in add()
260 // (B+A)*C =- in mul()
273 // Let B' and A' be the n-Bit inputs with some unknown errors EA, in mul()
276 // B' = B + 2^(n-e)*EB in mul()
277 // A' = A + 2^(n-e)*EA in mul()
286 // (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' = in mul()
287 // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' = in mul()
289 // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' = in mul()
290 // = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' = in mul()
291 // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C' = in mul()
292 // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C*2^c = in mul()
293 // = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c = in mul()
297 // = (B + A)*C' + EC*2^(n-e)*2^c = in mul()
298 // = (B + A)*C' + EC*2^(n-(e-c)) in mul()
300 // Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c in mul()
305 ErrorMSBs = (unsigned)-1; in mul()
309 // Multiplying by one is a no-op. in mul()
331 // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e') in lshr()
334 // E is a e-bit number, in lshr()
335 // E' is a e'-bit number, in lshr()
343 // B = b_h * 2^(n-1) + b_m * 2 + b_l in lshr()
344 // A = a_h * 2^(n-1) + a_m * 2 (pre(1)) in lshr()
346 // where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers in lshr()
348 // Let X = (B + A + E*2^(n-e)) >> 1 in lshr()
349 // Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1 in lshr()
351 // X = [B + A + E*2^(n-e)] >> 1 = in lshr()
352 // = [ b_h * 2^(n-1) + b_m * 2 + b_l + in lshr()
353 // + a_h * 2^(n-1) + a_m * 2 + in lshr()
354 // + E * 2^(n-e) ] >> 1 = in lshr()
357 // 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within in lshr()
362 // = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) + in lshr()
363 // + ((b_m + a_m) % 2^(n-2)) * 2 + in lshr()
364 // + b_l + E * 2^(n-e) ] >> 1 = in lshr()
369 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + in lshr()
370 // + ((b_m + a_m) % 2^(n-2)) + in lshr()
371 // + E * 2^(n-(e+1)) = in lshr()
375 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + in lshr()
376 // + ((b_m + a_m) % 2^(n-2)) + in lshr()
377 // + E * 2^(n-e') = in lshr()
381 // Y = (B >> 1) + (A >> 1) + E*2^(n-e') = in lshr()
382 // = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 + in lshr()
383 // + (a_h * 2^(n-1) + a_m * 2) >> 1 + in lshr()
384 // + E * 2^(n-e) >> 1 = in lshr()
389 // = b_h * 2^(n-2) + b_m + in lshr()
390 // + a_h * 2^(n-2) + a_m + in lshr()
391 // + E * 2^(n-(e+1)) = in lshr()
394 // the term 2^(n-1). But this time there is room for a second bit in the in lshr()
395 // term 2^(n-2) we add this bit to a new term and denote it o_h in a in lshr()
398 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) + in lshr()
399 // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + in lshr()
400 // + ((b_m + a_m) % 2^(n-2)) + in lshr()
401 // + E * 2^(n-(e+1)) = in lshr()
403 // Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1 in lshr()
406 // = o_h * 2^(n-1) + in lshr()
407 // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + in lshr()
408 // + ((b_m + a_m) % 2^(n-2)) + in lshr()
409 // + E * 2^(n-e') = in lshr()
414 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + in lshr()
415 // + ((b_m + a_m) % 2^(n-2)) + in lshr()
416 // + o_h * 2^(e'-1) * 2^(n-e') + | pre(2), move 2^(e'-1) in lshr()
418 // + E * 2^(n-e') = in lshr()
419 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + in lshr()
420 // + ((b_m + a_m) % 2^(n-2)) + in lshr()
421 // + [o_h * 2^(e'-1) + E] * 2^(n-e') + | move 2^(e'-1) out of in lshr()
424 // Let E' = o_h * 2^(e'-1) + E in lshr()
426 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) + in lshr()
427 // + ((b_m + a_m) % 2^(n-2)) + in lshr()
428 // + E' * 2^(n-e') in lshr()
437 // In this case Theorem(1) transforms to (the pre-condition on A can also be in lshr()
453 // hold. This is trivially the case for E' = X - Y. in lshr()
462 ErrorMSBs = (unsigned)-1; in lshr()
491 /// Apply a sign-extend or truncate operation on the polynomial.
496 decErrorMSBs(A.getBitWidth() - n); in sextOrTrunc()
503 incErrorMSBs(n - A.getBitWidth()); in sextOrTrunc()
544 Polynomial operator-(const Polynomial &o) const { in operator -()
552 return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs)); in operator -()
556 Polynomial operator-(uint64_t C) const { in operator -()
558 Result.A -= C; in operator -()
572 Polynomial r = *this - o; in isProvenEqualTo()
657 /// Basic-block the load instructions are within
669 /// Final shuffle-vector instruction
679 EI = new ElementInfo[VTy->getNumElements()]; in VectorInfo()
686 unsigned getDimension() const { return VTy->getNumElements(); } in getDimension()
688 /// Test if the VectorInfo can be part of an interleaved load with the
696 unsigned Size = DL.getTypeAllocSize(VTy->getElementType()); in isInterleaved()
734 Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0)); in computeFromBCI()
739 FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType()); in computeFromBCI()
744 if (Result.VTy->getNumElements() % VTy->getNumElements()) in computeFromBCI()
747 unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements(); in computeFromBCI()
748 unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType()); in computeFromBCI()
749 unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType()); in computeFromBCI()
758 for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) { in computeFromBCI()
790 cast<FixedVectorType>(SVI->getOperand(0)->getType()); in computeFromSVI()
794 if (!compute(SVI->getOperand(0), LHS, DL)) in computeFromSVI()
799 if (!compute(SVI->getOperand(1), RHS, DL)) in computeFromSVI()
838 for (int i : SVI->getShuffleMask()) { in computeFromSVI()
839 assert((i < 2 * (signed)ArgTy->getNumElements()) && in computeFromSVI()
844 else if (i < (signed)ArgTy->getNumElements()) { in computeFromSVI()
851 Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()]; in computeFromSVI()
874 if (LI->isVolatile()) in computeFromLI()
877 if (LI->isAtomic()) in computeFromLI()
880 if (!DL.typeSizeEqualsStoreSize(Result.VTy->getElementType())) in computeFromLI()
884 computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL); in computeFromLI()
886 Result.BB = LI->getParent(); in computeFromLI()
893 ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0), in computeFromLI()
894 ConstantInt::get(Type::getInt32Ty(LI->getContext()), i), in computeFromLI()
925 Result.add(C->getValue()); in computePolynomialBinOp()
933 Result.lshr(C->getValue()); in computePolynomialBinOp()
971 DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()); in computePolynomialFromPointer()
999 // non-constant. in computePolynomialFromPointer()
1037 Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0); in computePolynomialFromPointer()
1062 // Try to find an interleaved load using the front of Worklist as first line in findPattern()
1063 unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType()); in findPattern()
1069 if (C->VTy != C0->VTy) in findPattern()
1071 if (C->BB != C0->BB) in findPattern()
1073 if (C->PV != C0->PV) in findPattern()
1076 // Check the current value matches any of factor - 1 remaining lines in findPattern()
1078 if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) { in findPattern()
1110 BasicBlock *BB = (*LIs.begin())->getParent(); in findFirstLoad()
1112 *BB, [&LIs](Instruction &I) -> bool { return is_contained(LIs, &I); }); in findFirstLoad()
1113 assert(FLI != BB->end()); in findFirstLoad()
1120 LLVM_DEBUG(dbgs() << "Checking interleaved load\n"); in combine()
1148 // interleaved. This list excludes the instructions necessary for the in combine()
1173 for (auto *U : I->users()) { in combine()
1192 auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess(); in combine()
1207 Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType(); in combine()
1209 cast<FixedVectorType>(InterleavedLoad.front().SVI->getType()) in combine()
1210 ->getNumElements(); in combine()
1215 Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(), in combine()
1216 InsertionPoint->getPointerAddressSpace(), CostKind); in combine()
1223 auto Ptr = InsertionPoint->getPointerOperand(); in combine()
1224 auto LI = Builder.CreateAlignedLoad(ILTy, Ptr, InsertionPoint->getAlign(), in combine()
1225 "interleaved.wide.load"); in combine()
1239 auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle"); in combine()
1240 VI.SVI->replaceAllUsesWith(SVI); in combine()
1246 return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI) in combine()
1247 << "Load interleaved combined with factor " in combine()
1262 for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) { in run()
1269 if (isa<ScalableVectorType>(SVI->getType())) in run()
1272 Candidates.emplace_back(cast<FixedVectorType>(SVI->getType())); in run()
1291 // Remove the first element of the Interleaved Load but put the others in run()
1305 /// This pass combines interleaved loads into a pattern detectable by
1315 return "Interleaved Load Combine Pass"; in getPassName()
1333 TPC->getTM<TargetMachine>()) in runOnFunction()
1362 "Combine interleaved loads into wide loads and shufflevector instructions",
1369 "Combine interleaved loads into wide loads and shufflevector instructions", in INITIALIZE_PASS_DEPENDENCY()