//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
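///
/// For illustration only (schematic VPlan notation): unrolling by UF=2 clones
/// each non-uniform recipe once per extra part, e.g. a widened load %l of %p
/// gains a copy %l.1 of %p.1, while values uniform across parts are reused
/// for all parts.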
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
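  // For example (schematic): with UF = 3, a VPValue %v defined in part 0 has
  // VPV2Parts[%v] = {%v.1, %v.2}; part 0 continues to be represented by %v.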

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

  void unrollBlock(VPBlockBase *VPB);

  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for
  /// part \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part
  /// \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  VPIRFlags Flags;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    Flags = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *ScalarStep = IV->getStepValue();
  VPBuilder Builder(PH);
  VPInstruction *VectorStep = Builder.createNaryOp(
      VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, IVTy, Flags,
      IV->getDebugLoc());

  ToSkip.insert(VectorStep);

  // Now create recipes to compute the induction steps for parts 1 .. UF-1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding Step to
  // the previous part. The header phi recipe will get 2 new operands: the
  // step value for a single part and the last part, used to compute the
  // backedge value during VPWidenIntOrFpInductionRecipe::execute:
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
  //             %VectorStep, %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid unrolling them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc,
                                              {
                                                  Prev,
                                                  VectorStep,
                                              },
                                              Flags, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
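  // E.g. (schematic) %for = phi [%init, %ph], [%exit.v, %latch] keeps a
  // single phi; only the backedge operand is remapped later to the last part
  // of %exit.v.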
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      // If the start value is a ReductionStartVector, use the identity value
      // (second operand) for unrolled parts. If the scaling factor is > 1,
      // create a new ReductionStartVector with the scale factor and both
      // operands set to the identity value.
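      // Schematically: for %s = ReductionStartVector %init, %iden, %scale,
      // parts 1..UF-1 use %iden directly when %scale is 1; otherwise a single
      // shared clone ReductionStartVector %iden, %iden, %scale is created.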
      if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
        assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
               "unexpected start VPInstruction");
        if (Part != 1)
          continue;
        VPValue *StartV;
        if (match(VPI->getOperand(2), m_SpecificInt(1))) {
          StartV = VPI->getOperand(1);
        } else {
          auto *C = VPI->clone();
          C->setOperand(0, C->getOperand(1));
          C->insertAfter(VPI);
          StartV = C;
        }
        for (unsigned Part = 1; Part != UF; ++Part)
          VPV2Parts[VPI][Part - 1] = StartV;
      }
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
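      // E.g. with UF = 4, a single store remains and its operands are
      // remapped to part 3, the value live out of the loop.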
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi && Phi->isOrdered()) {
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Red);
        }
        Parts.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
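    // Schematically, the part-2 clone of %steps = VPScalarIVStepsRecipe %iv,
    // %step becomes VPScalarIVStepsRecipe %iv, %step, 2.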
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||
        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                        m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

    if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to AnyOf, FirstActiveLane and
    // Compute*Result which combine all parts to compute the final value.
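    // E.g. with UF = 2, AnyOf(%c) becomes AnyOf(%c, %c.1) (schematic), so
    // the final value is computed over all parts.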
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
                      m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
                      m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::ComputeFindIVResult>(
                      m_VPValue(), m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
                      m_VPValue(Op0))) ||
        match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
                      m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        auto *I = cast<VPInstruction>(&R);
        // Extracting from the end with VF = 1 implies retrieving the last or
        // penultimate scalar part (UF-1 or UF-2).
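        // E.g. with UF = 4, ExtractLastElement yields the part-3 scalar and
        // ExtractPenultimateElement the part-2 scalar directly.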
        unsigned Offset =
            I->getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
        I->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1) {
    return;
  }

  UnrollState Unroller(Plan, UF, Ctx);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
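  // After unrolling, the header phis are laid out schematically as
  //   %phi, %phi.1, ..., %phi.(UF-1), %phi2, %phi2.1, ...
  // so Part counts up from 1 within each group of clones.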
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of fixed-order recurrence phis, feeding the spliced
    // value across the backedge, needs to be remapped to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}

/// Create a single-scalar clone of \p RepR for lane \p Lane.
static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
                                       Type *IdxTy, VPReplicateRecipe *RepR,
                                       VPLane Lane) {
  // Collect the operands at Lane, creating extracts as needed.
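  // E.g. for lane 2 (schematic), a widened operand %v is accessed via
  // extractelement %v, i32 2; single-scalar operands and the operands of a
  // BuildVector are used directly.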
  SmallVector<VPValue *> NewOps;
  for (VPValue *Op : RepR->operands()) {
    if (vputils::isSingleScalar(Op)) {
      NewOps.push_back(Op);
      continue;
    }
    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
      NewOps.push_back(
          Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
      continue;
    }
    // Look through buildvector to avoid unnecessary extracts.
    if (match(Op, m_BuildVector())) {
      NewOps.push_back(
          cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
      continue;
    }
    VPValue *Idx =
        Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
    VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
    NewOps.push_back(Ext);
  }

  auto *New =
      new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
                            /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
  New->transferFlags(*RepR);
  New->insertBefore(RepR);
  return New;
}

void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
  Type *IdxTy = IntegerType::get(
      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);

  // Visit all VPBBs outside the loop region and directly inside the top-level
  // loop region.
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
  auto VPBBsToUnroll =
      concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
  for (VPBasicBlock *VPBB : VPBBsToUnroll) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || RepR->isSingleScalar())
        continue;

      VPBuilder Builder(RepR);
      if (RepR->getNumUsers() == 0) {
        if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
            vputils::isSingleScalar(RepR->getOperand(1))) {
          // Stores to invariant addresses need to store the last lane only.
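          // E.g. for fixed VF = 4 only lane 3 is cloned; for scalable VFs
          // the ScalableLast lane is used.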
          cloneForLane(Plan, Builder, IdxTy, RepR,
                       VPLane::getLastLaneForVF(VF));
        } else {
          // Create a single-scalar version of RepR for each lane.
          for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
        }
        RepR->eraseFromParent();
        continue;
      }
      // Create a single-scalar version of RepR for each lane.
      SmallVector<VPValue *> LaneDefs;
      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
        LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));

      // Users that only demand the first lane can use the definition for lane
      // 0.
      RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
        return U.onlyFirstLaneUsed(RepR);
      });

      // If needed, create a Build(Struct)Vector recipe to insert the scalar
      // lane values into a vector.
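      // Schematically, with VF = 4:
      //   %vec.res = BuildVector %lane0, %lane1, %lane2, %lane3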
      Type *ResTy = RepR->getUnderlyingInstr()->getType();
      VPValue *VecRes = Builder.createNaryOp(
          ResTy->isStructTy() ? VPInstruction::BuildStructVector
                              : VPInstruction::BuildVector,
          LaneDefs);
      RepR->replaceAllUsesWith(VecRes);
      RepR->eraseFromParent();
    }
  }
}