1 //===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements a set of utility VPlan to VPlan transformations.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlanTransforms.h"
15 #include "VPRecipeBuilder.h"
16 #include "VPlanAnalysis.h"
17 #include "VPlanCFG.h"
18 #include "VPlanDominatorTree.h"
19 #include "VPlanPatternMatch.h"
20 #include "llvm/ADT/PostOrderIterator.h"
21 #include "llvm/ADT/STLExtras.h"
22 #include "llvm/ADT/SetVector.h"
23 #include "llvm/Analysis/IVDescriptors.h"
24 #include "llvm/Analysis/VectorUtils.h"
25 #include "llvm/IR/Intrinsics.h"
26 #include "llvm/IR/PatternMatch.h"
27 
28 using namespace llvm;
29 
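/// Replace the VPInstructions and VPWidenPHIRecipes wrapping ingredient
/// instructions in the vector loop region with concrete widened recipes:
/// header phis with an int/FP induction descriptor become
/// VPWidenIntOrFpInductionRecipes, and the remaining VPInstructions become
/// widened load, store, GEP, call, select, cast or generic widen recipes.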
30 void VPlanTransforms::VPInstructionsToVPRecipes(
31     VPlanPtr &Plan,
32     function_ref<const InductionDescriptor *(PHINode *)>
33         GetIntOrFpInductionDescriptor,
34     ScalarEvolution &SE, const TargetLibraryInfo &TLI) {
35 
36   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
37       Plan->getVectorLoopRegion());
38   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
39     // Skip blocks outside region
40     if (!VPBB->getParent())
41       break;
42     VPRecipeBase *Term = VPBB->getTerminator();
43     auto EndIter = Term ? Term->getIterator() : VPBB->end();
44     // Introduce each ingredient into VPlan.
45     for (VPRecipeBase &Ingredient :
46          make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
47 
48       VPValue *VPV = Ingredient.getVPSingleValue();
49       Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
50 
51       VPRecipeBase *NewRecipe = nullptr;
52       if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
53         auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
54         const auto *II = GetIntOrFpInductionDescriptor(Phi);
55         if (!II)
56           continue;
57 
58         VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
59         VPValue *Step =
60             vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
61         NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II);
62       } else {
63         assert(isa<VPInstruction>(&Ingredient) &&
64                "only VPInstructions expected here");
65         assert(!isa<PHINode>(Inst) && "phis should be handled above");
66         // Create VPWidenMemoryRecipe for loads and stores.
67         if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
68           NewRecipe = new VPWidenLoadRecipe(
69               *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
70               false /*Consecutive*/, false /*Reverse*/,
71               Ingredient.getDebugLoc());
72         } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
73           NewRecipe = new VPWidenStoreRecipe(
74               *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
75               nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
76               Ingredient.getDebugLoc());
77         } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
78           NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
79         } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
80           NewRecipe = new VPWidenCallRecipe(
81               CI, Ingredient.operands(), getVectorIntrinsicIDForCall(CI, &TLI),
82               CI->getDebugLoc());
83         } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
84           NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
85         } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
86           NewRecipe = new VPWidenCastRecipe(
87               CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
88         } else {
89           NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
90         }
91       }
92 
93       NewRecipe->insertBefore(&Ingredient);
94       if (NewRecipe->getNumDefinedValues() == 1)
95         VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
96       else
97         assert(NewRecipe->getNumDefinedValues() == 0 &&
98                "Only recipes with zero or one defined values expected");
99       Ingredient.eraseFromParent();
100     }
101   }
102 }
103 
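/// Try to sink recipes feeding the "then" block of replicate regions into that
/// block. Candidates are side-effect-free VPReplicateRecipes and
/// VPScalarIVStepsRecipes; if some users remain outside the target block and
/// only use the first lane, a uniform clone is left behind for them. Returns
/// true if any recipe was moved.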
104 static bool sinkScalarOperands(VPlan &Plan) {
105   auto Iter = vp_depth_first_deep(Plan.getEntry());
106   bool Changed = false;
107   // First, collect the operands of all recipes in replicate blocks as seeds for
108   // sinking.
109   SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
110   for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
111     VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
112     if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
113       continue;
114     VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
115     if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
116       continue;
117     for (auto &Recipe : *VPBB) {
118       for (VPValue *Op : Recipe.operands())
119         if (auto *Def =
120                 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
121           WorkList.insert(std::make_pair(VPBB, Def));
122     }
123   }
124 
125   bool ScalarVFOnly = Plan.hasScalarVFOnly();
126   // Try to sink each replicate or scalar IV steps recipe in the worklist.
127   for (unsigned I = 0; I != WorkList.size(); ++I) {
128     VPBasicBlock *SinkTo;
129     VPSingleDefRecipe *SinkCandidate;
130     std::tie(SinkTo, SinkCandidate) = WorkList[I];
131     if (SinkCandidate->getParent() == SinkTo ||
132         SinkCandidate->mayHaveSideEffects() ||
133         SinkCandidate->mayReadOrWriteMemory())
134       continue;
135     if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
136       if (!ScalarVFOnly && RepR->isUniform())
137         continue;
138     } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
139       continue;
140 
141     bool NeedsDuplicating = false;
142     // All recipe users of the sink candidate must be in the same block SinkTo
143     // or all users outside of SinkTo must be uniform-after-vectorization
144     // (i.e., only the first lane is used). In the latter case, we need to duplicate
145     // SinkCandidate.
146     auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
147                             SinkCandidate](VPUser *U) {
148       auto *UI = dyn_cast<VPRecipeBase>(U);
149       if (!UI)
150         return false;
151       if (UI->getParent() == SinkTo)
152         return true;
153       NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
154       // We only know how to duplicate VPReplicateRecipes for now.
155       return NeedsDuplicating && isa<VPReplicateRecipe>(SinkCandidate);
156     };
157     if (!all_of(SinkCandidate->users(), CanSinkWithUser))
158       continue;
159 
160     if (NeedsDuplicating) {
161       if (ScalarVFOnly)
162         continue;
163       Instruction *I = SinkCandidate->getUnderlyingInstr();
164       auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
165       // TODO: add ".cloned" suffix to name of Clone's VPValue.
166 
167       Clone->insertBefore(SinkCandidate);
168       SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
169         return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
170       });
171     }
172     SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
173     for (VPValue *Op : SinkCandidate->operands())
174       if (auto *Def =
175               dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
176         WorkList.insert(std::make_pair(SinkTo, Def));
177     Changed = true;
178   }
179   return Changed;
180 }
181 
182 /// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
183 /// the mask.
184 VPValue *getPredicatedMask(VPRegionBlock *R) {
185   auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
186   if (!EntryBB || EntryBB->size() != 1 ||
187       !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
188     return nullptr;
189 
190   return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
191 }
192 
193 /// If \p R is a triangle region, return the 'then' block of the triangle.
194 static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
195   auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
196   if (EntryBB->getNumSuccessors() != 2)
197     return nullptr;
198 
199   auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
200   auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
201   if (!Succ0 || !Succ1)
202     return nullptr;
203 
204   if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
205     return nullptr;
206   if (Succ0->getSingleSuccessor() == Succ1)
207     return Succ0;
208   if (Succ1->getSingleSuccessor() == Succ0)
209     return Succ1;
210   return nullptr;
211 }
212 
213 // Merge replicate regions into their successor regions, if a replicate region
214 // is connected to a successor replicate region with the same predicate by a
215 // single, empty VPBasicBlock.
216 static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
217   SetVector<VPRegionBlock *> DeletedRegions;
218 
219   // Collect replicate regions followed by an empty block, followed by another
220   // replicate region with matching masks, to process up front. This is to avoid
221   // iterator invalidation issues while merging regions.
222   SmallVector<VPRegionBlock *, 8> WorkList;
223   for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
224            vp_depth_first_deep(Plan.getEntry()))) {
225     if (!Region1->isReplicator())
226       continue;
227     auto *MiddleBasicBlock =
228         dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
229     if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
230       continue;
231 
232     auto *Region2 =
233         dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
234     if (!Region2 || !Region2->isReplicator())
235       continue;
236 
237     VPValue *Mask1 = getPredicatedMask(Region1);
238     VPValue *Mask2 = getPredicatedMask(Region2);
239     if (!Mask1 || Mask1 != Mask2)
240       continue;
241 
242     assert(Mask1 && Mask2 && "both regions must have conditions");
243     WorkList.push_back(Region1);
244   }
245 
246   // Move recipes from Region1 to its successor region, if both are triangles.
247   for (VPRegionBlock *Region1 : WorkList) {
248     if (DeletedRegions.contains(Region1))
249       continue;
250     auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
251     auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
252 
253     VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
254     VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
255     if (!Then1 || !Then2)
256       continue;
257 
258     // Note: No fusion-preventing memory dependencies are expected in either
259     // region. Such dependencies should be rejected during earlier dependence
260     // checks, which guarantee accesses can be re-ordered for vectorization.
261     //
262     // Move recipes to the successor region.
263     for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
264       ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
265 
266     auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
267     auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
268 
269     // Move VPPredInstPHIRecipes from the merge block to the successor region's
270     // merge block. Update all users inside the successor region to use the
271     // original values.
272     for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
273       VPValue *PredInst1 =
274           cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
275       VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
276       Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
277         auto *UI = dyn_cast<VPRecipeBase>(&U);
278         return UI && UI->getParent() == Then2;
279       });
280 
281       // Remove phi recipes that are unused after merging the regions.
282       if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
283         Phi1ToMove.eraseFromParent();
284         continue;
285       }
286       Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
287     }
288 
289     // Finally, remove the first region.
290     for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
291       VPBlockUtils::disconnectBlocks(Pred, Region1);
292       VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
293     }
294     VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
295     DeletedRegions.insert(Region1);
296   }
297 
298   for (VPRegionBlock *ToDelete : DeletedRegions)
299     delete ToDelete;
300   return !DeletedRegions.empty();
301 }
302 
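/// Build a triangular if-then replicate region for the predicated recipe
/// \p PredRecipe: a "pred.*.entry" block branching on the recipe's mask, a
/// "pred.*.if" block holding an unmasked copy of the replicate recipe, and a
/// "pred.*.continue" block with a VPPredInstPHIRecipe merging the result if
/// the original recipe has users. The original recipe is erased.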
303 static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
304                                             VPlan &Plan) {
305   Instruction *Instr = PredRecipe->getUnderlyingInstr();
306   // Build the triangular if-then region.
307   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
308   assert(Instr->getParent() && "Predicated instruction not in any basic block");
309   auto *BlockInMask = PredRecipe->getMask();
310   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
311   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
312 
313   // Replace predicated replicate recipe with a replicate recipe without a
314   // mask but in the replicate region.
315   auto *RecipeWithoutMask = new VPReplicateRecipe(
316       PredRecipe->getUnderlyingInstr(),
317       make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
318       PredRecipe->isUniform());
319   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
320 
321   VPPredInstPHIRecipe *PHIRecipe = nullptr;
322   if (PredRecipe->getNumUsers() != 0) {
323     PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
324     PredRecipe->replaceAllUsesWith(PHIRecipe);
325     PHIRecipe->setOperand(0, RecipeWithoutMask);
326   }
327   PredRecipe->eraseFromParent();
328   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
329   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
330 
331   // Note: first set Entry as region entry and then connect successors starting
332   // from it in order, to propagate the "parent" of each VPBasicBlock.
333   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
334   VPBlockUtils::connectBlocks(Pred, Exiting);
335 
336   return Region;
337 }
338 
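/// Wrap each predicated VPReplicateRecipe in its own replicate region: split
/// the containing block at the recipe and reconnect the CFG through the new
/// region created by createReplicateRegion.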
339 static void addReplicateRegions(VPlan &Plan) {
340   SmallVector<VPReplicateRecipe *> WorkList;
341   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
342            vp_depth_first_deep(Plan.getEntry()))) {
343     for (VPRecipeBase &R : *VPBB)
344       if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
345         if (RepR->isPredicated())
346           WorkList.push_back(RepR);
347       }
348   }
349 
350   unsigned BBNum = 0;
351   for (VPReplicateRecipe *RepR : WorkList) {
352     VPBasicBlock *CurrentBlock = RepR->getParent();
353     VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
354 
355     BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
356     SplitBlock->setName(
357         OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
358     // Record predicated instructions for above packing optimizations.
359     VPBlockBase *Region = createReplicateRegion(RepR, Plan);
360     Region->setParent(CurrentBlock->getParent());
361     VPBlockUtils::disconnectBlocks(CurrentBlock, SplitBlock);
362     VPBlockUtils::connectBlocks(CurrentBlock, Region);
363     VPBlockUtils::connectBlocks(Region, SplitBlock);
364   }
365 }
366 
367 /// Remove redundant VPBasicBlocks by merging them into their predecessor if
368 /// the predecessor has a single successor.
369 static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
370   SmallVector<VPBasicBlock *> WorkList;
371   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
372            vp_depth_first_deep(Plan.getEntry()))) {
373     // Don't fold the exit block of the Plan into its single predecessor for
374     // now.
375     // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
376     if (VPBB->getNumSuccessors() == 0 && !VPBB->getParent())
377       continue;
378     auto *PredVPBB =
379         dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
380     if (!PredVPBB || PredVPBB->getNumSuccessors() != 1)
381       continue;
382     WorkList.push_back(VPBB);
383   }
384 
385   for (VPBasicBlock *VPBB : WorkList) {
386     VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
387     for (VPRecipeBase &R : make_early_inc_range(*VPBB))
388       R.moveBefore(*PredVPBB, PredVPBB->end());
389     VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
390     auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
391     if (ParentRegion && ParentRegion->getExiting() == VPBB)
392       ParentRegion->setExiting(PredVPBB);
393     for (auto *Succ : to_vector(VPBB->successors())) {
394       VPBlockUtils::disconnectBlocks(VPBB, Succ);
395       VPBlockUtils::connectBlocks(PredVPBB, Succ);
396     }
397     delete VPBB;
398   }
399   return !WorkList.empty();
400 }
401 
402 void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
403   // Convert masked VPReplicateRecipes to if-then region blocks.
404   addReplicateRegions(Plan);
405 
406   bool ShouldSimplify = true;
407   while (ShouldSimplify) {
408     ShouldSimplify = sinkScalarOperands(Plan);
409     ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
410     ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
411   }
412 }
413 
414 /// Remove redundant casts of inductions.
415 ///
416 /// Such redundant casts are casts of induction variables that can be ignored,
417 /// because we already proved that the casted phi is equal to the uncasted phi
418 /// in the vectorized loop. There is no need to vectorize the cast - the same
419 /// value can be used for both the phi and casts in the vector loop.
420 static void removeRedundantInductionCasts(VPlan &Plan) {
421   for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
422     auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
423     if (!IV || IV->getTruncInst())
424       continue;
425 
426     // A sequence of IR Casts has potentially been recorded for IV, which
427     // *must be bypassed* when the IV is vectorized, because the vectorized IV
428     // will produce the desired casted value. This sequence forms a def-use
429     // chain and is provided in reverse order, ending with the cast that uses
430     // the IV phi. Search for the recipe of the last cast in the chain and
431     // replace it with the original IV. Note that only the final cast is
432     // expected to have users outside the cast-chain and the dead casts left
433     // over will be cleaned up later.
434     auto &Casts = IV->getInductionDescriptor().getCastInsts();
435     VPValue *FindMyCast = IV;
436     for (Instruction *IRCast : reverse(Casts)) {
437       VPSingleDefRecipe *FoundUserCast = nullptr;
438       for (auto *U : FindMyCast->users()) {
439         auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
440         if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
441           FoundUserCast = UserCast;
442           break;
443         }
444       }
445       FindMyCast = FoundUserCast;
446     }
447     FindMyCast->replaceAllUsesWith(IV);
448   }
449 }
450 
451 /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
452 /// recipe, if it exists.
453 static void removeRedundantCanonicalIVs(VPlan &Plan) {
454   VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
455   VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
456   for (VPUser *U : CanonicalIV->users()) {
457     WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
458     if (WidenNewIV)
459       break;
460   }
461 
462   if (!WidenNewIV)
463     return;
464 
465   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
466   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
467     auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
468 
469     if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
470       continue;
471 
472     // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
473     // everything WidenNewIV's users need. That is, WidenOriginalIV will
474     // generate a vector phi or all users of WidenNewIV demand the first lane
475     // only.
476     if (any_of(WidenOriginalIV->users(),
477                [WidenOriginalIV](VPUser *U) {
478                  return !U->usesScalars(WidenOriginalIV);
479                }) ||
480         vputils::onlyFirstLaneUsed(WidenNewIV)) {
481       WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
482       WidenNewIV->eraseFromParent();
483       return;
484     }
485   }
486 }
487 
488 /// Returns true if \p R is dead and can be removed.
489 static bool isDeadRecipe(VPRecipeBase &R) {
490   using namespace llvm::PatternMatch;
491   // Do remove conditional assume instructions as their conditions may be
492   // flattened.
493   auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
494   bool IsConditionalAssume =
495       RepR && RepR->isPredicated() &&
496       match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
497   if (IsConditionalAssume)
498     return true;
499 
500   if (R.mayHaveSideEffects())
501     return false;
502 
503   // Recipe is dead if no user keeps the recipe alive.
504   return all_of(R.definedValues(),
505                 [](VPValue *V) { return V->getNumUsers() == 0; });
506 }
507 
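/// Erase every recipe in \p Plan for which isDeadRecipe returns true,
/// processing the recipes of each block bottom-up so chains of dead recipes
/// are removed as well.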
508 static void removeDeadRecipes(VPlan &Plan) {
509   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
510       Plan.getEntry());
511 
512   for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
513     // The recipes in the block are processed in reverse order, to catch chains
514     // of dead recipes.
515     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
516       if (isDeadRecipe(R))
517         R.eraseFromParent();
518     }
519   }
520 }
521 
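/// Build the recipes computing scalar induction steps for an induction with
/// kind \p Kind, start \p StartV and step \p Step: reuse the canonical IV if
/// it matches, otherwise derive the induction from it with a
/// VPDerivedIVRecipe, truncate the base IV and the step if their types differ
/// from the requested result type, and return a VPScalarIVStepsRecipe
/// inserted at \p IP in the vector loop header.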
522 static VPScalarIVStepsRecipe *
523 createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
524                     Instruction::BinaryOps InductionOpcode,
525                     FPMathOperator *FPBinOp, ScalarEvolution &SE,
526                     Instruction *TruncI, VPValue *StartV, VPValue *Step,
527                     VPBasicBlock::iterator IP) {
528   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
529   VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
530   VPSingleDefRecipe *BaseIV = CanonicalIV;
531   if (!CanonicalIV->isCanonical(Kind, StartV, Step)) {
532     BaseIV = new VPDerivedIVRecipe(Kind, FPBinOp, StartV, CanonicalIV, Step);
533     HeaderVPBB->insert(BaseIV, IP);
534   }
535 
536   // Truncate base induction if needed.
537   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
538                           SE.getContext());
539   Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
540   if (TruncI) {
541     Type *TruncTy = TruncI->getType();
542     assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
543            "Not truncating.");
544     assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
545     BaseIV = new VPScalarCastRecipe(Instruction::Trunc, BaseIV, TruncTy);
546     HeaderVPBB->insert(BaseIV, IP);
547     ResultTy = TruncTy;
548   }
549 
550   // Truncate step if needed.
551   Type *StepTy = TypeInfo.inferScalarType(Step);
552   if (ResultTy != StepTy) {
553     assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
554            "Not truncating.");
555     assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
556     Step = new VPScalarCastRecipe(Instruction::Trunc, Step, ResultTy);
557     auto *VecPreheader =
558         cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
559     VecPreheader->appendRecipe(Step->getDefiningRecipe());
560   }
561 
562   VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(
563       BaseIV, Step, InductionOpcode,
564       FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags());
565   HeaderVPBB->insert(Steps, IP);
566   return Steps;
567 }
568 
569 /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
570 /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
571 /// VPWidenPointerInductionRecipe will generate vectors only. If some users
572 /// require vectors while others require scalars, the scalar uses need to extract
573 /// the scalars from the generated vectors (note that this is different to how
574 /// int/fp inductions are handled). Also optimize VPWidenIntOrFpInductionRecipe,
575 /// if any of its users needs scalar values, by providing them scalar steps
576 /// built on the canonical scalar IV and updating the original IV's users. This is
577 /// an optional optimization to reduce the need for vector extracts.
578 static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
579   SmallVector<VPRecipeBase *> ToRemove;
580   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
581   bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
582   VPBasicBlock::iterator InsertPt = HeaderVPBB->getFirstNonPhi();
583   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
584     // Replace wide pointer inductions which have only their scalars used by
585     // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
586     if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
587       if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
588         continue;
589 
590       const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
591       VPValue *StartV =
592           Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
593       VPValue *StepV = PtrIV->getOperand(1);
594       VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
595           Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
596           SE, nullptr, StartV, StepV, InsertPt);
597 
598       auto *Recipe = new VPInstruction(VPInstruction::PtrAdd,
599                                        {PtrIV->getStartValue(), Steps},
600                                        PtrIV->getDebugLoc(), "next.gep");
601 
602       Recipe->insertAfter(Steps);
603       PtrIV->replaceAllUsesWith(Recipe);
604       continue;
605     }
606 
607     // Replace widened induction with scalar steps for users that only use
608     // scalars.
609     auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
610     if (!WideIV)
611       continue;
612     if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
613           return U->usesScalars(WideIV);
614         }))
615       continue;
616 
617     const InductionDescriptor &ID = WideIV->getInductionDescriptor();
618     VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
619         Plan, ID.getKind(), ID.getInductionOpcode(),
620         dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), SE,
621         WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
622         InsertPt);
623 
624     // Update scalar users of IV to use Step instead.
625     if (!HasOnlyVectorVFs)
626       WideIV->replaceAllUsesWith(Steps);
627     else
628       WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
629         return U.usesScalars(WideIV);
630       });
631   }
632 }
633 
634 /// Remove redundant VPExpandSCEVRecipes in \p Plan's entry block by replacing
635 /// them with already existing recipes expanding the same SCEV expression.
636 static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
637   DenseMap<const SCEV *, VPValue *> SCEV2VPV;
638 
639   for (VPRecipeBase &R :
640        make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
641     auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
642     if (!ExpR)
643       continue;
644 
645     auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
646     if (I.second)
647       continue;
648     ExpR->replaceAllUsesWith(I.first->second);
649     ExpR->eraseFromParent();
650   }
651 }
652 
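/// Erase the recipe defining \p V if it is dead, and transitively erase the
/// defining recipes of its operands that become dead as a result.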
653 static void recursivelyDeleteDeadRecipes(VPValue *V) {
654   SmallVector<VPValue *> WorkList;
655   SmallPtrSet<VPValue *, 8> Seen;
656   WorkList.push_back(V);
657 
658   while (!WorkList.empty()) {
659     VPValue *Cur = WorkList.pop_back_val();
660     if (!Seen.insert(Cur).second)
661       continue;
662     VPRecipeBase *R = Cur->getDefiningRecipe();
663     if (!R)
664       continue;
665     if (!isDeadRecipe(*R))
666       continue;
667     WorkList.append(R->op_begin(), R->op_end());
668     R->eraseFromParent();
669   }
670 }
671 
672 void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
673                                          unsigned BestUF,
674                                          PredicatedScalarEvolution &PSE) {
675   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
676   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
677   VPBasicBlock *ExitingVPBB =
678       Plan.getVectorLoopRegion()->getExitingBasicBlock();
679   auto *Term = &ExitingVPBB->back();
680   // Try to simplify the branch condition if TC <= VF * UF when preparing to
681   // execute the plan for the main vector loop. We only do this if the
682   // terminator is:
683   //  1. BranchOnCount, or
684   //  2. BranchOnCond where the input is Not(ActiveLaneMask).
685   using namespace llvm::VPlanPatternMatch;
686   if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
687       !match(Term,
688              m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
689     return;
690 
691   Type *IdxTy =
692       Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType();
693   const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE);
694   ScalarEvolution &SE = *PSE.getSE();
695   ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
696   const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
697   if (TripCount->isZero() ||
698       !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
699     return;
700 
701   LLVMContext &Ctx = SE.getContext();
702   auto *BOC =
703       new VPInstruction(VPInstruction::BranchOnCond,
704                         {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
705 
706   SmallVector<VPValue *> PossiblyDead(Term->operands());
707   Term->eraseFromParent();
708   for (VPValue *Op : PossiblyDead)
709     recursivelyDeleteDeadRecipes(Op);
710   ExitingVPBB->appendRecipe(BOC);
711   Plan.setVF(BestVF);
712   Plan.setUF(BestUF);
713   // TODO: Further simplifications are possible
714   //      1. Replace inductions with constants.
715   //      2. Replace vector loop region with VPBasicBlock.
716 }
717 
718 #ifndef NDEBUG
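/// Debug-only helper: return the replicate region containing recipe \p R, or
/// nullptr if \p R is not placed in one.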
719 static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) {
720   auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
721   if (Region && Region->isReplicator()) {
722     assert(Region->getNumSuccessors() == 1 &&
723            Region->getNumPredecessors() == 1 && "Expected SESE region!");
724     assert(R->getParent()->size() == 1 &&
725            "A recipe in an original replicator region must be the only "
726            "recipe in its block");
727     return Region;
728   }
729   return nullptr;
730 }
731 #endif
732 
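/// Return true if recipe \p A properly dominates recipe \p B: by recipe order
/// when both are in the same block, and by block dominance in \p VPDT
/// otherwise.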
733 static bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B,
734                               VPDominatorTree &VPDT) {
735   if (A == B)
736     return false;
737 
738   auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) {
739     for (auto &R : *A->getParent()) {
740       if (&R == A)
741         return true;
742       if (&R == B)
743         return false;
744     }
745     llvm_unreachable("recipe not found");
746   };
747   const VPBlockBase *ParentA = A->getParent();
748   const VPBlockBase *ParentB = B->getParent();
749   if (ParentA == ParentB)
750     return LocalComesBefore(A, B);
751 
752   assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) &&
753          "No replicate regions expected at this point");
754   assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) &&
755          "No replicate regions expected at this point");
756   return VPDT.properlyDominates(ParentA, ParentB);
757 }
758 
759 /// Sink users of \p FOR after the recipe defining the previous value \p
760 /// Previous of the recurrence. \returns true if all users of \p FOR could be
761 /// re-arranged as needed or false if it is not possible.
762 static bool
763 sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
764                                  VPRecipeBase *Previous,
765                                  VPDominatorTree &VPDT) {
766   // Collect recipes that need sinking.
767   SmallVector<VPRecipeBase *> WorkList;
768   SmallPtrSet<VPRecipeBase *, 8> Seen;
769   Seen.insert(Previous);
770   auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
771     // The previous value must not depend on the users of the recurrence phi;
772     // otherwise, FOR is not a fixed-order recurrence.
773     if (SinkCandidate == Previous)
774       return false;
775 
776     if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
777         !Seen.insert(SinkCandidate).second ||
778         properlyDominates(Previous, SinkCandidate, VPDT))
779       return true;
780 
781     if (SinkCandidate->mayHaveSideEffects())
782       return false;
783 
784     WorkList.push_back(SinkCandidate);
785     return true;
786   };
787 
788   // Recursively sink users of FOR after Previous.
789   WorkList.push_back(FOR);
790   for (unsigned I = 0; I != WorkList.size(); ++I) {
791     VPRecipeBase *Current = WorkList[I];
792     assert(Current->getNumDefinedValues() == 1 &&
793            "only recipes with a single defined value expected");
794 
795     for (VPUser *User : Current->getVPSingleValue()->users()) {
796       if (auto *R = dyn_cast<VPRecipeBase>(User))
797         if (!TryToPushSinkCandidate(R))
798           return false;
799     }
800   }
801 
802   // Keep recipes to sink ordered by dominance so earlier instructions are
803   // processed first.
804   sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
805     return properlyDominates(A, B, VPDT);
806   });
807 
808   for (VPRecipeBase *SinkCandidate : WorkList) {
809     if (SinkCandidate == FOR)
810       continue;
811 
812     SinkCandidate->moveAfter(Previous);
813     Previous = SinkCandidate;
814   }
815   return true;
816 }
817 
818 bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
819                                                   VPBuilder &LoopBuilder) {
820   VPDominatorTree VPDT;
821   VPDT.recalculate(Plan);
822 
823   SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
824   for (VPRecipeBase &R :
825        Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
826     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
827       RecurrencePhis.push_back(FOR);
828 
829   VPBasicBlock *MiddleVPBB =
830       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
831   VPBuilder MiddleBuilder;
832   // Set the insert point so new recipes are inserted before the terminator's
833   // compare condition if there is one, and otherwise before the terminator.
834   if (auto *Term =
835           dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator())) {
836     if (auto *Cmp = dyn_cast<VPInstruction>(Term->getOperand(0)))
837       MiddleBuilder.setInsertPoint(Cmp);
838     else
839       MiddleBuilder.setInsertPoint(Term);
840   } else
841     MiddleBuilder.setInsertPoint(MiddleVPBB);
842 
843   for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
844     SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
845     VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
846     // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
847     // to terminate.
848     while (auto *PrevPhi =
849                dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
850       assert(PrevPhi->getParent() == FOR->getParent());
851       assert(SeenPhis.insert(PrevPhi).second);
852       Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
853     }
854 
855     if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
856       return false;
857 
858     // Introduce a recipe to combine the incoming and previous values of a
859     // fixed-order recurrence.
860     VPBasicBlock *InsertBlock = Previous->getParent();
861     if (isa<VPHeaderPHIRecipe>(Previous))
862       LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
863     else
864       LoopBuilder.setInsertPoint(InsertBlock,
865                                  std::next(Previous->getIterator()));
866 
867     auto *RecurSplice = cast<VPInstruction>(
868         LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
869                                  {FOR, FOR->getBackedgeValue()}));
870 
871     FOR->replaceAllUsesWith(RecurSplice);
872     // Set the first operand of RecurSplice to FOR again, after replacing
873     // all users.
874     RecurSplice->setOperand(0, FOR);
875 
876     // This is the second phase of vectorizing first-order recurrences. An
877     // overview of the transformation is described below. Suppose we have the
878     // following loop with some use after the loop of the last a[i-1],
879     //
880     //   for (int i = 0; i < n; ++i) {
881     //     t = a[i - 1];
882     //     b[i] = a[i] - t;
883     //   }
884     //   use t;
885     //
886     // There is a first-order recurrence on "a". For this loop, the shorthand
887     // scalar IR looks like:
888     //
889     //   scalar.ph:
890     //     s_init = a[-1]
891     //     br scalar.body
892     //
893     //   scalar.body:
894     //     i = phi [0, scalar.ph], [i+1, scalar.body]
895     //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
896     //     s2 = a[i]
897     //     b[i] = s2 - s1
898     //     br cond, scalar.body, exit.block
899     //
900     //   exit.block:
901     //     use = lcssa.phi [s1, scalar.body]
902     //
903     // In this example, s1 is a recurrence because its value depends on the
904     // previous iteration. In the first phase of vectorization, we created a
905     // vector phi v1 for s1. We now complete the vectorization and produce the
906     // shorthand vector IR shown below (for VF = 4, UF = 1).
907     //
908     //   vector.ph:
909     //     v_init = vector(..., ..., ..., a[-1])
910     //     br vector.body
911     //
912     //   vector.body
913     //     i = phi [0, vector.ph], [i+4, vector.body]
914     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
915     //     v2 = a[i, i+1, i+2, i+3];
916     //     v3 = vector(v1(3), v2(0, 1, 2))
917     //     b[i, i+1, i+2, i+3] = v2 - v3
918     //     br cond, vector.body, middle.block
919     //
920     //   middle.block:
921     //     s_penultimate = v2(2) = v3(3)
922     //     s_resume = v2(3)
923     //     br cond, scalar.ph, exit.block
924     //
925     //   scalar.ph:
926     //     s_init' = phi [s_resume, middle.block], [s_init, otherwise]
927     //     br scalar.body
928     //
929     //   scalar.body:
930     //     i = phi [0, scalar.ph], [i+1, scalar.body]
931     //     s1 = phi [s_init', scalar.ph], [s2, scalar.body]
932     //     s2 = a[i]
933     //     b[i] = s2 - s1
934     //     br cond, scalar.body, exit.block
935     //
936     //   exit.block:
937     //     lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
938     //
939     // After execution completes the vector loop, we extract the next value of
940     // the recurrence (x) to use as the initial value in the scalar loop. This
941     // is modeled by ExtractFromEnd.
942     Type *IntTy = Plan.getCanonicalIV()->getScalarType();
943 
944     // Extract the penultimate value of the recurrence and update VPLiveOut
945     // users of the recurrence splice. Note that the extract of the final value
946     // used to resume in the scalar loop is created earlier during VPlan
947     // construction.
948     auto *Penultimate = cast<VPInstruction>(MiddleBuilder.createNaryOp(
949         VPInstruction::ExtractFromEnd,
950         {FOR->getBackedgeValue(),
951          Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))},
952         {}, "vector.recur.extract.for.phi"));
953     RecurSplice->replaceUsesWithIf(
954         Penultimate, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
955   }
956   return true;
957 }
958 
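/// Collect all users reachable from \p V, transitively following the values
/// defined by user recipes, without descending through header phi recipes.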
959 static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
960   SetVector<VPUser *> Users(V->user_begin(), V->user_end());
961   for (unsigned I = 0; I != Users.size(); ++I) {
962     VPRecipeBase *Cur = dyn_cast<VPRecipeBase>(Users[I]);
963     if (!Cur || isa<VPHeaderPHIRecipe>(Cur))
964       continue;
965     for (VPValue *V : Cur->definedValues())
966       Users.insert(V->user_begin(), V->user_end());
967   }
968   return Users.takeVector();
969 }
970 
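/// Drop poison-generating flags from all recipes that transitively use an
/// integer Add or Mul reduction phi in the vector loop header.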
971 void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
972   for (VPRecipeBase &R :
973        Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
974     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
975     if (!PhiR)
976       continue;
977     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
978     RecurKind RK = RdxDesc.getRecurrenceKind();
979     if (RK != RecurKind::Add && RK != RecurKind::Mul)
980       continue;
981 
982     for (VPUser *U : collectUsersRecursively(PhiR))
983       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
984         RecWithFlags->dropPoisonGeneratingFlags();
985       }
986   }
987 }
988 
989 /// Try to simplify recipe \p R.
990 static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
991   using namespace llvm::VPlanPatternMatch;
992   // Try to remove redundant blend recipes.
993   if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
994     VPValue *Inc0 = Blend->getIncomingValue(0);
995     for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
996       if (Inc0 != Blend->getIncomingValue(I) &&
997           !match(Blend->getMask(I), m_False()))
998         return;
999     Blend->replaceAllUsesWith(Inc0);
1000     Blend->eraseFromParent();
1001     return;
1002   }
1003 
1004   VPValue *A;
1005   if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1006     VPValue *Trunc = R.getVPSingleValue();
1007     Type *TruncTy = TypeInfo.inferScalarType(Trunc);
1008     Type *ATy = TypeInfo.inferScalarType(A);
1009     if (TruncTy == ATy) {
1010       Trunc->replaceAllUsesWith(A);
1011     } else {
1012       // Don't replace a scalarizing recipe with a widened cast.
1013       if (isa<VPReplicateRecipe>(&R))
1014         return;
1015       if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1016 
1017         unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1018                                  ? Instruction::SExt
1019                                  : Instruction::ZExt;
1020         auto *VPC =
1021             new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
1022         if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1023           // UnderlyingExt has distinct return type, used to retain legacy cost.
1024           VPC->setUnderlyingValue(UnderlyingExt);
1025         }
1026         VPC->insertBefore(&R);
1027         Trunc->replaceAllUsesWith(VPC);
1028       } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1029         auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
1030         VPC->insertBefore(&R);
1031         Trunc->replaceAllUsesWith(VPC);
1032       }
1033     }
1034 #ifndef NDEBUG
1035     // Verify that the cached type info for both A and its users is still
1036     // accurate by comparing it to freshly computed types.
1037     VPTypeAnalysis TypeInfo2(
1038         R.getParent()->getPlan()->getCanonicalIV()->getScalarType(),
1039         TypeInfo.getContext());
1040     assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1041     for (VPUser *U : A->users()) {
1042       auto *R = dyn_cast<VPRecipeBase>(U);
1043       if (!R)
1044         continue;
1045       for (VPValue *VPV : R->definedValues())
1046         assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1047     }
1048 #endif
1049   }
1050 
1051   // Simplify (X && Y) || (X && !Y) -> X.
1052   // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1053   // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1054   // recipes to be visited during simplification.
1055   VPValue *X, *Y, *X1, *Y1;
1056   if (match(&R,
1057             m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1058                          m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
1059       X == X1 && Y == Y1) {
1060     R.getVPSingleValue()->replaceAllUsesWith(X);
1061     return;
1062   }
1063 
1064   if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
1065     return R.getVPSingleValue()->replaceAllUsesWith(A);
1066 }
1067 
1068 /// Try to simplify the recipes in \p Plan.
1069 static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
1070   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1071       Plan.getEntry());
1072   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx);
1073   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1074     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1075       simplifyRecipe(R, TypeInfo);
1076     }
1077   }
1078 }
1079 
1080 void VPlanTransforms::truncateToMinimalBitwidths(
1081     VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
1082     LLVMContext &Ctx) {
1083 #ifndef NDEBUG
1084   // Count the processed recipes and cross check the count later with MinBWs
1085   // size, to make sure all entries in MinBWs have been handled.
1086   unsigned NumProcessedRecipes = 0;
1087 #endif
1088   // Keep track of created truncates, so they can be re-used. Note that we
1089   // cannot use RAUW after creating a new truncate, as this could make
1090   // other uses have different types for their operands, making them invalidly
1091   // typed.
1092   DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
1093   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx);
1094   VPBasicBlock *PH = Plan.getEntry();
1095   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1096            vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
1097     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1098       if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
1099                VPWidenSelectRecipe, VPWidenLoadRecipe>(&R))
1100         continue;
1101 
1102       VPValue *ResultVPV = R.getVPSingleValue();
1103       auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
1104       unsigned NewResSizeInBits = MinBWs.lookup(UI);
1105       if (!NewResSizeInBits)
1106         continue;
1107 
1108 #ifndef NDEBUG
1109       NumProcessedRecipes++;
1110 #endif
1111       // If the value wasn't vectorized, we must maintain the original scalar
1112       // type. Skip those here, after incrementing NumProcessedRecipes. Also
1113       // skip casts which do not need to be handled explicitly here, as
1114       // redundant casts will be removed during recipe simplification.
1115       if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
1116 #ifndef NDEBUG
1117         // If any of the operands is a live-in and not used by VPWidenRecipe or
1118         // VPWidenSelectRecipe, but in MinBWs, make sure it is counted as
1119         // processed as well. When MinBWs is constructed, there is no
1120         // information about whether recipes are widened or replicated, and in
1121         // case they are replicated the operands are not truncated. Counting
1122         // them here ensures we do not miss any recipes in MinBWs.
1123         // TODO: Remove once the analysis is done on VPlan.
1124         for (VPValue *Op : R.operands()) {
1125           if (!Op->isLiveIn())
1126             continue;
1127           auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
1128           if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
1129               all_of(Op->users(), [](VPUser *U) {
1130                 return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
1131               })) {
1132             // Add an entry to ProcessedTruncs to avoid counting the same
1133             // operand multiple times.
1134             ProcessedTruncs[Op] = nullptr;
1135             NumProcessedRecipes += 1;
1136           }
1137         }
1138 #endif
1139         continue;
1140       }
1141 
1142       Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
1143       unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
1144       assert(OldResTy->isIntegerTy() && "only integer types supported");
1145       (void)OldResSizeInBits;
1146 
1147       auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
1148 
1149       // Any wrapping introduced by shrinking this operation shouldn't be
1150       // considered undefined behavior. So, we can't unconditionally copy
1151       // arithmetic wrapping flags to VPW.
1152       if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
1153         VPW->dropPoisonGeneratingFlags();
1154 
1155       using namespace llvm::VPlanPatternMatch;
1156       if (OldResSizeInBits != NewResSizeInBits &&
1157           !match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) {
1158         // Extend result to original width.
1159         auto *Ext =
1160             new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
1161         Ext->insertAfter(&R);
1162         ResultVPV->replaceAllUsesWith(Ext);
1163         Ext->setOperand(0, ResultVPV);
1164         assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
1165       } else
1166         assert(
1167             match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
1168             "Only ICmps should not need extending the result.");
1169 
1170       assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
1171       if (isa<VPWidenLoadRecipe>(&R))
1172         continue;
1173 
1174       // Shrink operands by introducing truncates as needed.
1175       unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
1176       for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
1177         auto *Op = R.getOperand(Idx);
1178         unsigned OpSizeInBits =
1179             TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
1180         if (OpSizeInBits == NewResSizeInBits)
1181           continue;
1182         assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
1183         auto [ProcessedIter, IterIsEmpty] =
1184             ProcessedTruncs.insert({Op, nullptr});
1185         VPWidenCastRecipe *NewOp =
1186             IterIsEmpty
1187                 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
1188                 : ProcessedIter->second;
1189         R.setOperand(Idx, NewOp);
1190         if (!IterIsEmpty)
1191           continue;
1192         ProcessedIter->second = NewOp;
1193         if (!Op->isLiveIn()) {
1194           NewOp->insertBefore(&R);
1195         } else {
1196           PH->appendRecipe(NewOp);
1197 #ifndef NDEBUG
1198           auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
1199           bool IsContained = MinBWs.contains(OpInst);
1200           NumProcessedRecipes += IsContained;
1201 #endif
1202         }
1203       }
1204 
1205     }
1206   }
1207 
1208   assert(MinBWs.size() == NumProcessedRecipes &&
1209          "some entries in MinBWs haven't been processed");
1210 }
1211 
1212 void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
1213   removeRedundantCanonicalIVs(Plan);
1214   removeRedundantInductionCasts(Plan);
1215 
1216   simplifyRecipes(Plan, SE.getContext());
1217   legalizeAndOptimizeInductions(Plan, SE);
1218   removeDeadRecipes(Plan);
1219 
1220   createAndOptimizeReplicateRegions(Plan);
1221 
1222   removeRedundantExpandSCEVRecipes(Plan);
1223   mergeBlocksIntoPredecessors(Plan);
1224 }
1225 
1226 // Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
1227 // the loop terminator with a branch-on-cond recipe with the negated
1228 // active-lane-mask as operand. Note that this turns the loop into an
1229 // uncountable one. Only the existing terminator is replaced, all other existing
1230 // recipes/users remain unchanged, except for poison-generating flags being
1231 // dropped from the canonical IV increment. Return the created
1232 // VPActiveLaneMaskPHIRecipe.
1233 //
1234 // The function uses the following definitions:
1235 //
1236 //  %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
1237 //    calculate-trip-count-minus-VF (original TC) : original TC
1238 //  %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
1239 //     CanonicalIVPhi : CanonicalIVIncrement
1240 //  %StartV is the canonical induction start value.
1241 //
1242 // The function adds the following recipes:
1243 //
1244 // vector.ph:
1245 //   %TripCount = calculate-trip-count-minus-VF (original TC)
1246 //       [if DataWithControlFlowWithoutRuntimeCheck]
1247 //   %EntryInc = canonical-iv-increment-for-part %StartV
1248 //   %EntryALM = active-lane-mask %EntryInc, %TripCount
1249 //
1250 // vector.body:
1251 //   ...
1252 //   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
1253 //   ...
1254 //   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
1255 //   %ALM = active-lane-mask %InLoopInc, %TripCount
1256 //   %Negated = Not %ALM
1257 //   branch-on-cond %Negated
1258 //
1259 static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
1260     VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
1261   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
1262   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
1263   auto *CanonicalIVPHI = Plan.getCanonicalIV();
1264   VPValue *StartV = CanonicalIVPHI->getStartValue();
1265 
1266   auto *CanonicalIVIncrement =
1267       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
1268   // TODO: Check if dropping the flags is needed if
1269   // !DataAndControlFlowWithoutRuntimeCheck.
1270   CanonicalIVIncrement->dropPoisonGeneratingFlags();
1271   DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
1272   // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
1273   // we have to take unrolling into account. Each part needs to start at
1274   //   Part * VF
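       // (e.g. with a fixed VF of 4 and UF of 2, part 1's mask starts at lane
       // index 4).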
1275   auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
1276   VPBuilder Builder(VecPreheader);
1277 
1278   // Create the ActiveLaneMask instruction using the correct start values.
1279   VPValue *TC = Plan.getTripCount();
1280 
1281   VPValue *TripCount, *IncrementValue;
1282   if (!DataAndControlFlowWithoutRuntimeCheck) {
1283     // When the loop is guarded by a runtime overflow check for the loop
1284     // induction variable increment by VF, we can increment the value before
1285     // the get.active.lane.mask intrinsic and use the unmodified trip count.
1286     IncrementValue = CanonicalIVIncrement;
1287     TripCount = TC;
1288   } else {
1289     // When avoiding a runtime check, the active.lane.mask inside the loop
1290     // uses a modified trip count and the induction variable increment is
1291     // done after the active.lane.mask intrinsic is called.
1292     IncrementValue = CanonicalIVPHI;
1293     TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
1294                                      {TC}, DL);
1295   }
1296   auto *EntryIncrement = Builder.createOverflowingOp(
1297       VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
1298       "index.part.next");
1299 
1300   // Create the active lane mask instruction in the VPlan preheader.
1301   auto *EntryALM =
1302       Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
1303                            DL, "active.lane.mask.entry");
1304 
1305   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
1306   // preheader ActiveLaneMask instruction.
1307   auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
1308   LaneMaskPhi->insertAfter(CanonicalIVPHI);
1309 
1310   // Create the active lane mask for the next iteration of the loop before the
1311   // original terminator.
1312   VPRecipeBase *OriginalTerminator = EB->getTerminator();
1313   Builder.setInsertPoint(OriginalTerminator);
1314   auto *InLoopIncrement =
1315       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
1316                                   {IncrementValue}, {false, false}, DL);
1317   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
1318                                    {InLoopIncrement, TripCount}, DL,
1319                                    "active.lane.mask.next");
1320   LaneMaskPhi->addOperand(ALM);
1321 
1322   // Replace the original terminator with BranchOnCond. We have to invert the
1323   // mask here because a true condition means jumping to the exit block.
1324   auto *NotMask = Builder.createNot(ALM, DL);
1325   Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
1326   OriginalTerminator->eraseFromParent();
1327   return LaneMaskPhi;
1328 }
1329 
1330 /// Collect all VPValues representing a header mask through the (ICMP_ULE,
1331 /// WideCanonicalIV, backedge-taken-count) pattern.
1332 /// TODO: Introduce explicit recipe for header-mask instead of searching
1333 /// for the header-mask pattern manually.
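     ///
     /// For example, with canonical IV %iv and backedge-taken count %BTC, the
     /// collected mask is roughly of the form:
     ///   %wide.iv = WIDEN-CANONICAL-INDUCTION %iv
     ///   %mask = icmp ule %wide.iv, %BTC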
1334 static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
1335   SmallVector<VPValue *> WideCanonicalIVs;
1336   auto *FoundWidenCanonicalIVUser =
1337       find_if(Plan.getCanonicalIV()->users(),
1338               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
1339   assert(count_if(Plan.getCanonicalIV()->users(),
1340                   [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
1341              1 &&
1342          "Must have at most one VPWideCanonicalIVRecipe");
1343   if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
1344     auto *WideCanonicalIV =
1345         cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
1346     WideCanonicalIVs.push_back(WideCanonicalIV);
1347   }
1348 
1349   // Also include VPWidenIntOrFpInductionRecipes that represent a widened
1350   // version of the canonical induction.
1351   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1352   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1353     auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1354     if (WidenOriginalIV && WidenOriginalIV->isCanonical())
1355       WideCanonicalIVs.push_back(WidenOriginalIV);
1356   }
1357 
1358   // Walk users of wide canonical IVs and collect all compares of the form
1359   // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
1360   SmallVector<VPValue *> HeaderMasks;
1361   for (auto *Wide : WideCanonicalIVs) {
1362     for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
1363       auto *HeaderMask = dyn_cast<VPInstruction>(U);
1364       if (!HeaderMask || !vputils::isHeaderMask(HeaderMask, Plan))
1365         continue;
1366 
1367       assert(HeaderMask->getOperand(0) == Wide &&
1368              "WidenCanonicalIV must be the first operand of the compare");
1369       HeaderMasks.push_back(HeaderMask);
1370     }
1371   }
1372   return HeaderMasks;
1373 }
1374 
1375 void VPlanTransforms::addActiveLaneMask(
1376     VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
1377     bool DataAndControlFlowWithoutRuntimeCheck) {
1378   assert((!DataAndControlFlowWithoutRuntimeCheck ||
1379           UseActiveLaneMaskForControlFlow) &&
1380          "DataAndControlFlowWithoutRuntimeCheck implies "
1381          "UseActiveLaneMaskForControlFlow");
1382 
1383   auto FoundWidenCanonicalIVUser =
1384       find_if(Plan.getCanonicalIV()->users(),
1385               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
1386   assert(FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end() &&
1387          "Must have widened canonical IV when tail folding!");
1388   auto *WideCanonicalIV =
1389       cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
1390   VPSingleDefRecipe *LaneMask;
1391   if (UseActiveLaneMaskForControlFlow) {
1392     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
1393         Plan, DataAndControlFlowWithoutRuntimeCheck);
1394   } else {
1395     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
1396     LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
1397                               {WideCanonicalIV, Plan.getTripCount()}, nullptr,
1398                               "active.lane.mask");
1399   }
1400 
1401   // Walk users of WideCanonicalIV and replace all compares of the form
1402   // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
1403   // active-lane-mask.
1404   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
1405     HeaderMask->replaceAllUsesWith(LaneMask);
1406 }
1407 
1408 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
1409 /// replace all uses of the VPCanonicalIVPHIRecipe, except the canonical IV
1410 /// increment, with a VPEVLBasedIVPHIRecipe. After this transformation, the
1411 /// VPCanonicalIVPHIRecipe is only used for counting loop iterations.
1412 ///
1413 /// The function uses the following definitions:
1414 ///  %StartV is the canonical induction start value.
1415 ///
1416 /// The function adds the following recipes:
1417 ///
1418 /// vector.ph:
1419 /// ...
1420 ///
1421 /// vector.body:
1422 /// ...
1423 /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
1424 ///                                               [ %NextEVLIV, %vector.body ]
1425 /// %VPEVL = EXPLICIT-VECTOR-LENGTH %EVLPhi, original TC
1426 /// ...
1427 /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
1428 /// ...
1429 ///
1430 bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
1431   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1432   // The transform updates all users of inductions to work based on EVL, instead
1433   // of the VF directly. At the moment, widened inductions cannot be updated, so
1434   // bail out if the plan contains any.
1435   bool ContainsWidenInductions = any_of(Header->phis(), [](VPRecipeBase &Phi) {
1436     return isa<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
1437         &Phi);
1438   });
1439   // FIXME: Remove this once we can transform (select header_mask, true_value,
1440   // false_value) into vp.merge.
1441   bool ContainsOutloopReductions =
1442       any_of(Header->phis(), [&](VPRecipeBase &Phi) {
1443         auto *R = dyn_cast<VPReductionPHIRecipe>(&Phi);
1444         return R && !R->isInLoop();
1445       });
1446   if (ContainsWidenInductions || ContainsOutloopReductions)
1447     return false;
1448 
1449   auto *CanonicalIVPHI = Plan.getCanonicalIV();
1450   VPValue *StartV = CanonicalIVPHI->getStartValue();
1451 
1452   // Create the ExplicitVectorLengthPhi recipe in the main loop.
1453   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
1454   EVLPhi->insertAfter(CanonicalIVPHI);
1455   auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
1456                                   {EVLPhi, Plan.getTripCount()});
1457   VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
1458 
1459   auto *CanonicalIVIncrement =
1460       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
1461   VPSingleDefRecipe *OpVPEVL = VPEVL;
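       // The EVL computed above is an i32; if the canonical IV uses a
       // different width, cast the EVL to the IV type before the add below.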
1462   if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
1463       IVSize != 32) {
1464     OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc
1465                                                  : Instruction::ZExt,
1466                                      OpVPEVL, CanonicalIVPHI->getScalarType());
1467     OpVPEVL->insertBefore(CanonicalIVIncrement);
1468   }
1469   auto *NextEVLIV =
1470       new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi},
1471                         {CanonicalIVIncrement->hasNoUnsignedWrap(),
1472                          CanonicalIVIncrement->hasNoSignedWrap()},
1473                         CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
1474   NextEVLIV->insertBefore(CanonicalIVIncrement);
1475   EVLPhi->addOperand(NextEVLIV);
1476 
1477   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
1478     for (VPUser *U : collectUsersRecursively(HeaderMask)) {
1479       VPRecipeBase *NewRecipe = nullptr;
1480       auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
1481       if (!CurRecipe)
1482         continue;
1483 
1484       auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
1485         assert(OrigMask && "Unmasked recipe when folding tail");
1486         return HeaderMask == OrigMask ? nullptr : OrigMask;
1487       };
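           // When the recipe's only mask is the header mask itself, the EVL
           // already limits the active lanes, so the EVL-based replacement can
           // be left unmasked.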
1488       if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) {
1489         VPValue *NewMask = GetNewMask(MemR->getMask());
1490         if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
1491           NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
1492         else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
1493           NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
1494         else
1495           llvm_unreachable("unsupported recipe");
1496       } else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
1497         NewRecipe = new VPReductionEVLRecipe(RedR, VPEVL,
1498                                              GetNewMask(RedR->getCondOp()));
1499       }
1500 
1501       if (NewRecipe) {
1502         [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues();
1503         assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
1504                "New recipe must define the same number of values as the "
1505                "original.");
1506         assert(
1507             NumDefVal <= 1 &&
1508             "Only supports recipes with a single definition or without users.");
1509         NewRecipe->insertBefore(CurRecipe);
1510         if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(NewRecipe)) {
1511           VPValue *CurVPV = CurRecipe->getVPSingleValue();
1512           CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
1513         }
1514         CurRecipe->eraseFromParent();
1515       }
1516     }
1517     recursivelyDeleteDeadRecipes(HeaderMask);
1518   }
1519   // Replace all uses of VPCanonicalIVPHIRecipe by
1520   // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
1521   CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
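       // The replacement above also rewired the increment's IV operand, so
       // point it back at the canonical IV, which keeps counting iterations.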
1522   CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
1523   // TODO: support unroll factor > 1.
1524   Plan.setUF(1);
1525   return true;
1526 }
1527 
1528 void VPlanTransforms::dropPoisonGeneratingRecipes(
1529     VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication) {
1530   // Collect recipes in the backward slice of `Root` that may generate a poison
1531   // value that is used after vectorization.
1532   SmallPtrSet<VPRecipeBase *, 16> Visited;
1533   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1534     SmallVector<VPRecipeBase *, 16> Worklist;
1535     Worklist.push_back(Root);
1536 
1537     // Traverse the backward slice of Root through its use-def chain.
1538     while (!Worklist.empty()) {
1539       VPRecipeBase *CurRec = Worklist.back();
1540       Worklist.pop_back();
1541 
1542       if (!Visited.insert(CurRec).second)
1543         continue;
1544 
1545       // Prune the search if we find another recipe generating a widened memory
1546       // instruction. Widened memory instructions involved in address computation
1547       // will lead to gather/scatter instructions, which don't need to be
1548       // handled.
1549       if (isa<VPWidenMemoryRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) ||
1550           isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec))
1551         continue;
1552 
1553       // This recipe contributes to the address computation of a widened
1554       // load/store. If the underlying instruction has poison-generating flags,
1555       // drop them directly.
1556       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1557         VPValue *A, *B;
1558         using namespace llvm::VPlanPatternMatch;
1559         // Dropping disjoint from an OR may yield incorrect results, as some
1560         // analysis may have converted it to an Add implicitly (e.g. SCEV used
1561         // for dependence analysis). Instead, replace it with an equivalent Add.
1562         // This is possible as all users of the disjoint OR only access lanes
1563         // where the operands are disjoint or poison otherwise.
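             // For example: (or disjoint %a, %b) is rewritten to (add %a, %b).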
1564         if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
1565             RecWithFlags->isDisjoint()) {
1566           VPBuilder Builder(RecWithFlags);
1567           VPInstruction *New = Builder.createOverflowingOp(
1568               Instruction::Add, {A, B}, {false, false},
1569               RecWithFlags->getDebugLoc());
1570           New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
1571           RecWithFlags->replaceAllUsesWith(New);
1572           RecWithFlags->eraseFromParent();
1573           CurRec = New;
1574         } else
1575           RecWithFlags->dropPoisonGeneratingFlags();
1576       } else {
1577         Instruction *Instr = dyn_cast_or_null<Instruction>(
1578             CurRec->getVPSingleValue()->getUnderlyingValue());
1579         (void)Instr;
1580         assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1581                "found instruction with poison generating flags not covered by "
1582                "VPRecipeWithIRFlags");
1583       }
1584 
1585       // Add new definitions to the worklist.
1586       for (VPValue *operand : CurRec->operands())
1587         if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1588           Worklist.push_back(OpDef);
1589     }
1590   });
1591 
1592   // Traverse all the recipes in the VPlan and collect the poison-generating
1593   // recipes in the backward slice starting at the address of a
1594   // VPWidenMemoryRecipe or VPInterleaveRecipe.
1595   auto Iter = vp_depth_first_deep(Plan.getEntry());
1596   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1597     for (VPRecipeBase &Recipe : *VPBB) {
1598       if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
1599         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1600         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1601         if (AddrDef && WidenRec->isConsecutive() &&
1602             BlockNeedsPredication(UnderlyingInstr.getParent()))
1603           collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1604       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1605         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1606         if (AddrDef) {
1607           // Check if any member of the interleave group needs predication.
1608           const InterleaveGroup<Instruction> *InterGroup =
1609               InterleaveRec->getInterleaveGroup();
1610           bool NeedPredication = false;
1611           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1612                I < NumMembers; ++I) {
1613             Instruction *Member = InterGroup->getMember(I);
1614             if (Member)
1615               NeedPredication |= BlockNeedsPredication(Member->getParent());
1616           }
1617 
1618           if (NeedPredication)
1619             collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1620         }
1621       }
1622     }
1623   }
1624 }
1625