//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

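/// Try to convert plain VPInstructions wrapping the original loop's
/// ingredients into dedicated widening recipes. Schematically (the printed
/// forms below are an illustrative sketch of VPlan dumps, not exact output),
/// a load ingredient such as
///   EMIT ir<%v> = load ir<%addr>
/// is replaced by
///   WIDEN ir<%v> = load ir<%addr>
/// and a header phi with a known int/fp induction descriptor becomes a
/// VPWidenIntOrFpInductionRecipe. Returns false if a call is encountered
/// that cannot be widened as an intrinsic.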
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
    VPlanPtr &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    ScalarEvolution &SE, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan->getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Skip blocks outside the region.
    if (!VPBB->getParent())
      break;
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      if (!VPV->getUnderlyingValue())
        continue;

      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
        const auto *II = GetIntOrFpInductionDescriptor(Phi);
        if (!II)
          continue;

        VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
        VPValue *Step =
            vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
        NewRecipe = new VPWidenIntOrFpInductionRecipe(
            Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
      } else {
        assert(isa<VPInstruction>(&Ingredient) &&
               "only VPInstructions expected here");
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenLoadRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
              Ingredient.getDebugLoc());
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenStoreRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
              VPIRMetadata(*Store), Ingredient.getDebugLoc());
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
          if (VectorID == Intrinsic::not_intrinsic)
            return false;
          NewRecipe = new VPWidenIntrinsicRecipe(
              *CI, getVectorIntrinsicIDForCall(CI, &TLI),
              {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
              CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
  return true;
}

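/// Sink scalar operands used inside replicate regions into those regions, so
/// values only needed under a mask are not computed unconditionally. Operands
/// of recipes in a region's "then" block seed a worklist; replicate and
/// scalar-IV-steps candidates without side effects or memory accesses are
/// moved into the region, cloning them first if uses remain outside it.
/// Returns true if anything was sunk.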
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool Changed = false;
  // First, collect the operands of all recipes in replicate blocks as seeds for
  // sinking.
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB) {
      for (VPValue *Op : Recipe.operands())
        if (auto *Def =
                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
          WorkList.insert(std::make_pair(VPBB, Def));
    }
  }

  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];
    if (SinkCandidate->getParent() == SinkTo ||
        SinkCandidate->mayHaveSideEffects() ||
        SinkCandidate->mayReadOrWriteMemory())
      continue;
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
      if (!ScalarVFOnly && RepR->isSingleScalar())
        continue;
    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
      continue;

    bool NeedsDuplicating = false;
    // All recipe users of the sink candidate must be in the same block SinkTo
    // or all users outside of SinkTo must be uniform-after-vectorization (i.e.,
    // only the first lane is used). In the latter case, we need to duplicate
    // SinkCandidate.
    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
                            SinkCandidate](VPUser *U) {
      auto *UI = cast<VPRecipeBase>(U);
      if (UI->getParent() == SinkTo)
        return true;
      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
      // We only know how to duplicate VPReplicateRecipes and
      // VPScalarIVStepsRecipes for now.
      return NeedsDuplicating &&
             isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
    };
    if (!all_of(SinkCandidate->users(), CanSinkWithUser))
      continue;

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      VPSingleDefRecipe *Clone;
      if (auto *SinkCandidateRepR =
              dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
        // TODO: Handle converting to uniform recipes as separate transform,
        // then cloning should be sufficient here.
        Instruction *I = SinkCandidate->getUnderlyingInstr();
        Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
                                      nullptr /*Mask*/, *SinkCandidateRepR);
        // TODO: add ".cloned" suffix to name of Clone's VPValue.
      } else {
        Clone = SinkCandidate->clone();
      }

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      if (auto *Def =
              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
        WorkList.insert(std::make_pair(SinkTo, Def));
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions in their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
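//
// Schematically (block names illustrative), the CFG
//
//   [region1: entry(BRANCH-ON-MASK %m) -> then1 -> merge1]
//     -> empty VPBB
//       -> [region2: entry(BRANCH-ON-MASK %m) -> then2 -> merge2]
//
// becomes a single region: then1's recipes move into then2, merge1's
// VPPredInstPHIRecipes move into merge2, and region1 is disconnected.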
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks to process front. This is to avoid
  // iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (TransformedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions.
      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
        Phi1ToMove.eraseFromParent();
        continue;
      }
      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Remove the dead recipes in Region1's entry block.
    for (VPRecipeBase &R :
         make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
      R.eraseFromParent();

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    TransformedRegions.insert(Region1);
  }

  return !TransformedRegions.empty();
}

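/// Build a triangular if-then region around the predicated replicate recipe
/// \p PredRecipe. Schematically (a sketch; block names come from the recipe's
/// opcode, e.g. for a masked store):
///
///   pred.store.entry:
///     BRANCH-ON-MASK %mask
///   pred.store.if:
///     replicate recipe, with the mask operand dropped
///   pred.store.continue:
///     VPPredInstPHIRecipe (only created if the result has users)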
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *MaskDef = BlockInMask->getDefiningRecipe();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(
      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
  auto *Entry =
      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
  auto *Pred =
      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
                                        RecipeWithoutMask->getDebugLoc());
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting =
      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region =
      Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

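/// Wrap each predicated VPReplicateRecipe in an if-then replicate region built
/// by createReplicateRegion, splitting the recipe's parent block at the recipe
/// so the new region can be inserted on the resulting edge.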
static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);

    VPRegionBlock *ParentRegion = Region->getParent();
    if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
      ParentRegion->setExiting(SplitBlock);
  }
}

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
        isa<VPIRBasicBlock>(PredVPBB))
      continue;
    WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = VPBB->getParent();
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  }
  return !WorkList.empty();
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
  }
}

/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
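///
/// For example (illustrative IR), given a cast chain recorded for %iv
///   %iv = phi i32 ...
///   %trunc = trunc i32 %iv to i16
///   %ext = sext i16 %trunc to i32
/// all users of the recipe for %ext are redirected to the widened %iv; the
/// now-dead cast recipes are removed by later cleanup.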
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

/// Returns true if \p R is dead and can be removed.
static bool isDeadRecipe(VPRecipeBase &R) {
  using namespace llvm::PatternMatch;
  // Do remove conditional assume instructions as their conditions may be
  // flattened.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  bool IsConditionalAssume =
      RepR && RepR->isPredicated() &&
      match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
  if (IsConditionalAssume)
    return true;

  if (R.mayHaveSideEffects())
    return false;

  // Recipe is dead if no user keeps the recipe alive.
  return all_of(R.definedValues(),
                [](VPValue *V) { return V->getNumUsers() == 0; });
}

void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB :
       reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R))
        R.eraseFromParent();
    }
  }
}

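/// Create recipes computing the scalar steps of an induction described by
/// (\p Kind, \p InductionOpcode, \p StartV, \p Step) at the insert point of
/// \p Builder: a derived IV based on the canonical IV ("offset.idx"),
/// optional truncations of the derived IV and the step, and finally a
/// VPScalarIVStepsRecipe producing the per-lane scalar values.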
static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPValue *StartV, VPValue *Step, DebugLoc DL,
                    VPBuilder &Builder) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
      Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");

  // Truncate base induction if needed.
  Type *CanonicalIVType = CanonicalIV->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
    ResultTy = TruncTy;
  }

  // Truncate step if needed.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
  }
  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
                                     &Plan.getVF(), DL);
}

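/// Collect the transitive users of \p V in a deterministic order, without
/// traversing through header phi recipes.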
static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
  SetVector<VPUser *> Users(llvm::from_range, V->users());
  for (unsigned I = 0; I != Users.size(); ++I) {
    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
    if (isa<VPHeaderPHIRecipe>(Cur))
      continue;
    for (VPValue *V : Cur->definedValues())
      Users.insert_range(V->users());
  }
  return Users.takeVector();
}

/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (note that this is different
/// to how int/fp inductions are handled). Legalize extract-from-ends using
/// uniform VPReplicateRecipe of wide inductions to use regular
/// VPReplicateRecipe, so the correct end value is available. Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and updating
/// the original IV's users. This is an optional optimization to reduce the
/// need for vector extracts.
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  using namespace llvm::VPlanPatternMatch;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    auto Users = collectUsersRecursively(PhiR);
    for (VPUser *U : reverse(Users)) {
      auto *Def = dyn_cast<VPSingleDefRecipe>(U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(U);
      // Skip recipes that shouldn't be narrowed.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
        continue;

      // Skip recipes that may have other lanes than their first used.
      if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
        continue;

      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                          Def->operands(), /*IsUniform*/ true);
      Clone->insertAfter(Def);
      Def->replaceAllUsesWith(Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
      if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
        continue;

      const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
      VPValue *StartV =
          Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
      VPValue *StepV = PtrIV->getOperand(1);
      VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
          Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
          nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);

      VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
                                             PtrIV->getDebugLoc(), "next.gep");

      PtrIV->replaceAllUsesWith(PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, ID.getKind(), ID.getInductionOpcode(),
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
        WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

/// Check if \p VPV is an untruncated wide induction, either before or after
/// the increment. If so return the header IV (before the increment), otherwise
/// return null.
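///
/// For example (illustrative), given a wide induction recipe for %iv with
/// step %s, both %iv itself and an increment such as
///   EMIT %iv.next = add %iv, %s
/// return the header recipe for %iv, while a truncated int/fp induction
/// returns null.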
static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
  if (WideIV) {
    // VPV itself is a wide induction, separately compute the end value for exit
    // users if it is not a truncated IV.
    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
    return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
  }

  // Check if VPV is an optimizable induction increment.
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  if (!Def || Def->getNumOperands() != 2)
    return nullptr;
  WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
  if (!WideIV)
    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
  if (!WideIV)
    return nullptr;

  auto IsWideIVInc = [&]() {
    using namespace VPlanPatternMatch;
    auto &ID = WideIV->getInductionDescriptor();

    // Check if VPV increments the induction by the induction step.
    VPValue *IVStep = WideIV->getStepValue();
    switch (ID.getInductionOpcode()) {
    case Instruction::Add:
      return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
                                                     m_Specific(IVStep)));
    case Instruction::FAdd:
      return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
                                                      m_Specific(IVStep)));
    case Instruction::FSub:
      return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
                                                    m_Specific(IVStep)));
    case Instruction::Sub: {
      // IVStep will be the negated step of the subtraction. Check if Step == -1
      // * IVStep.
      VPValue *Step;
      if (!match(VPV,
                 m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
          !Step->isLiveIn() || !IVStep->isLiveIn())
        return false;
      auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
      auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
      return StepCI && IVStepCI &&
             StepCI->getValue() == (-1 * IVStepCI->getValue());
    }
    default:
      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
             match(VPV, m_GetElementPtr(m_Specific(WideIV),
                                        m_Specific(WideIV->getStepValue())));
    }
    llvm_unreachable("should have been covered by switch above");
  };
  return IsWideIVInc() ? WideIV : nullptr;
}

/// Attempts to optimize the induction variable exit values for users in the
/// early exit block.
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
                                               VPTypeAnalysis &TypeInfo,
                                               VPBlockBase *PredVPBB,
                                               VPValue *Op) {
  using namespace VPlanPatternMatch;

  VPValue *Incoming, *Mask;
  if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
                     m_VPValue(Incoming),
                     m_VPInstruction<VPInstruction::FirstActiveLane>(
                         m_VPValue(Mask)))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming);
  if (!WideIV)
    return nullptr;

  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  // Calculate the final index.
  VPValue *EndValue = Plan.getCanonicalIV();
  auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
  VPBuilder B(cast<VPBasicBlock>(PredVPBB));

  DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
  VPValue *FirstActiveLane =
      B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
  Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
  FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
                                              FirstActiveLaneType, DL);
  EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we need to
  // add the step.
  if (Incoming != WideIV) {
    VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
    EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
  }

  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Start = WideIV->getStartValue();
    VPValue *Step = WideIV->getStepValue();
    EndValue = B.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, EndValue, Step);
  }

  return EndValue;
}

/// Attempts to optimize the induction variable exit values for users in the
/// exit block coming from the latch in the original scalar loop.
static VPValue *
optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
                               VPBlockBase *PredVPBB, VPValue *Op,
                               DenseMap<VPValue *, VPValue *> &EndValues) {
  using namespace VPlanPatternMatch;

  VPValue *Incoming;
  if (!match(Op, m_VPInstruction<VPInstruction::ExtractLastElement>(
                     m_VPValue(Incoming))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming);
  if (!WideIV)
    return nullptr;

  VPValue *EndValue = EndValues.lookup(WideIV);
  assert(EndValue && "end value must have been pre-computed");

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we don't
  // need to subtract the step.
  if (Incoming != WideIV)
    return EndValue;

  // Otherwise, subtract the step from the EndValue.
  VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
  VPValue *Step = WideIV->getStepValue();
  Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
  if (ScalarTy->isIntegerTy())
    return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
  if (ScalarTy->isPointerTy()) {
    auto *Zero = Plan.getOrAddLiveIn(
        ConstantInt::get(Step->getLiveInIRValue()->getType(), 0));
    return B.createPtrAdd(EndValue,
                          B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
                          "ind.escape");
  }
  if (ScalarTy->isFloatingPointTy()) {
    const auto &ID = WideIV->getInductionDescriptor();
    return B.createNaryOp(
        ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
            ? Instruction::FSub
            : Instruction::FAdd,
        {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
  }
  llvm_unreachable("all possible induction types must be handled");
  return nullptr;
}

void VPlanTransforms::optimizeInductionExitUsers(
    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
  VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(&R);

      for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
        VPValue *Escape = nullptr;
        if (PredVPBB == MiddleVPBB)
          Escape = optimizeLatchExitInductionUser(
              Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), EndValues);
        else
          Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
                                                  ExitIRI->getOperand(Idx));
        if (Escape)
          ExitIRI->setOperand(Idx, Escape);
      }
    }
  }
}

/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

static void recursivelyDeleteDeadRecipes(VPValue *V) {
  SmallVector<VPValue *> WorkList;
  SmallPtrSet<VPValue *, 8> Seen;
  WorkList.push_back(V);

  while (!WorkList.empty()) {
    VPValue *Cur = WorkList.pop_back_val();
    if (!Seen.insert(Cur).second)
      continue;
    VPRecipeBase *R = Cur->getDefiningRecipe();
    if (!R)
      continue;
    if (!isDeadRecipe(*R))
      continue;
    WorkList.append(R->op_begin(), R->op_end());
    R->eraseFromParent();
  }
}

/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
/// foldable live-ins.
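///
/// For example (illustrative), an add whose operands are the constant
/// live-ins 2 and 3 folds to the IR constant 5, which the caller can then
/// wrap as a live-in VPValue.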
static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
                               ArrayRef<VPValue *> Operands,
                               const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
  SmallVector<Value *, 4> Ops;
  for (VPValue *Op : Operands) {
    if (!Op->isLiveIn() || !Op->getLiveInIRValue())
      return nullptr;
    Ops.push_back(Op->getLiveInIRValue());
  }

  InstSimplifyFolder Folder(DL);
  if (Instruction::isBinaryOp(Opcode))
    return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
                            Ops[1]);
  if (Instruction::isCast(Opcode))
    return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
                           TypeInfo.inferScalarType(R.getVPSingleValue()));
  switch (Opcode) {
  case VPInstruction::LogicalAnd:
    return Folder.FoldSelect(Ops[0], Ops[1],
                             ConstantInt::getNullValue(Ops[1]->getType()));
  case VPInstruction::Not:
    return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
                            Constant::getAllOnesValue(Ops[0]->getType()));
  case Instruction::Select:
    return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
  case Instruction::ICmp:
  case Instruction::FCmp:
    return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
                          Ops[1]);
  case Instruction::GetElementPtr: {
    auto &RFlags = cast<VPRecipeWithIRFlags>(R);
    auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
    return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
                          RFlags.getGEPNoWrapFlags());
  }
  case VPInstruction::PtrAdd:
    return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
                          Ops[1],
                          cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
  case Instruction::InsertElement:
    return Folder.FoldInsertElement(Ops[0], Ops[1], Ops[2]);
  case Instruction::ExtractElement:
    return Folder.FoldExtractElement(Ops[0], Ops[1]);
  }
  return nullptr;
}

/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  using namespace llvm::VPlanPatternMatch;
  VPlan *Plan = R.getParent()->getPlan();

  auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
  if (!Def)
    return;

  // Simplification of live-in IR values for SingleDef recipes using
  // InstSimplifyFolder.
  if (TypeSwitch<VPRecipeBase *, bool>(&R)
          .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
                VPReplicateRecipe>([&](auto *I) {
            const DataLayout &DL =
                Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
            Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
                                        TypeInfo);
            if (V)
              I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
            return V;
          })
          .Default([](auto *) { return false; }))
    return;

  // Fold PredPHI LiveIn -> LiveIn.
  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
    VPValue *Op = PredPHI->getOperand(0);
    if (Op->isLiveIn())
      PredPHI->replaceAllUsesWith(Op);
  }

  VPValue *A;
  if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
    Type *TruncTy = TypeInfo.inferScalarType(Def);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Def->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(Def))
        return;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                 ? Instruction::SExt
                                 : Instruction::ZExt;
        auto *VPC =
            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
          // UnderlyingExt has distinct return type, used to retain legacy cost.
          VPC->setUnderlyingValue(UnderlyingExt);
        }
        VPC->insertBefore(&R);
        Def->replaceAllUsesWith(VPC);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
        VPC->insertBefore(&R);
        Def->replaceAllUsesWith(VPC);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = cast<VPRecipeBase>(U);
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
  }

  // Simplify (X && Y) || (X && !Y) -> X.
  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
  // recipes to be visited during simplification.
  VPValue *X, *Y;
  if (match(Def,
            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                         m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
    Def->replaceAllUsesWith(X);
    Def->eraseFromParent();
    return;
  }

  // OR x, all-ones -> all-ones.
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
    Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)
                                                    : Def->getOperand(0));
    Def->eraseFromParent();
    return;
  }

  if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
    return Def->replaceAllUsesWith(X);

  // select !c, x, y -> select c, y, x
  VPValue *C;
  if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
    Def->setOperand(0, C);
    Def->setOperand(1, Y);
    Def->setOperand(2, X);
    return;
  }

  if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
    return Def->replaceAllUsesWith(A);

  if (match(Def, m_Not(m_VPValue(A)))) {
    if (match(A, m_Not(m_VPValue(A))))
      return Def->replaceAllUsesWith(A);

    // Try to fold Not into compares by adjusting the predicate in-place.
    if (isa<VPWidenRecipe>(A) && A->getNumUsers() == 1) {
      auto *WideCmp = cast<VPWidenRecipe>(A);
      if (WideCmp->getOpcode() == Instruction::ICmp ||
          WideCmp->getOpcode() == Instruction::FCmp) {
        WideCmp->setPredicate(
            CmpInst::getInversePredicate(WideCmp->getPredicate()));
        Def->replaceAllUsesWith(WideCmp);
        // If WideCmp doesn't have a debug location, use the one from the
        // negation, to preserve the location.
        if (!WideCmp->getDebugLoc() && R.getDebugLoc())
          WideCmp->setDebugLoc(R.getDebugLoc());
      }
    }
  }

  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
  if ((match(Def,
             m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
       match(Def,
             m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
      TypeInfo.inferScalarType(Def->getOperand(1)) ==
          TypeInfo.inferScalarType(Def))
    return Def->replaceAllUsesWith(Def->getOperand(1));

  if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(
                     m_VPValue(X), m_SpecificInt(1)))) {
    Type *WideStepTy = TypeInfo.inferScalarType(Def);
    if (TypeInfo.inferScalarType(X) != WideStepTy)
      X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy);
    Def->replaceAllUsesWith(X);
    return;
  }

  // For i1 vp.merges produced by AnyOf reductions:
  // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
  if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
                                                  m_VPValue(X), m_VPValue())) &&
      match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
      TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
    Def->setOperand(1, Def->getOperand(0));
    Def->setOperand(0, Y);
    return;
  }

  if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
    if (Phi->getOperand(0) == Phi->getOperand(1))
      Def->replaceAllUsesWith(Phi->getOperand(0));
    return;
  }

  // Look through ExtractLastElement (BuildVector ....).
  if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
                    m_BuildVector()))) {
    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
    Def->replaceAllUsesWith(
        BuildVector->getOperand(BuildVector->getNumOperands() - 1));
    return;
  }

  // Look through ExtractPenultimateElement (BuildVector ....).
  if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
                    m_BuildVector()))) {
    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
    Def->replaceAllUsesWith(
        BuildVector->getOperand(BuildVector->getNumOperands() - 2));
    return;
  }

  // Some simplifications can only be applied after unrolling. Perform them
  // below.
  if (!Plan->isUnrolled())
    return;

  // VPScalarIVSteps for part 0 can be replaced by their start value, if only
  // the first lane is demanded.
  if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
    if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
      Steps->replaceAllUsesWith(Steps->getOperand(0));
      return;
    }
  }
  // Simplify redundant ReductionStartVector recipes after unrolling.
  VPValue *StartV;
  if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
                     m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
    Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
      auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
      return PhiR && PhiR->isInLoop();
    });
    return;
  }

  if (match(Def,
            m_VPInstruction<VPInstruction::ExtractLastElement>(
                m_VPInstruction<VPInstruction::Broadcast>(m_VPValue(A))))) {
    Def->replaceAllUsesWith(A);
    return;
  }

  VPInstruction *OpVPI;
  if (match(Def, m_VPInstruction<VPInstruction::ExtractLastElement>(
                     m_VPInstruction(OpVPI))) &&
      OpVPI->isVectorToScalar()) {
    Def->replaceAllUsesWith(OpVPI);
    return;
  }
}

void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(&CanonicalIVTy);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

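/// Narrow wide or replicating recipes whose results are known to be uniform
/// across lanes to single-scalar VPReplicateRecipes, so the value is computed
/// once rather than once per lane.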
static void narrowToSingleScalarRecipes(VPlan &Plan) {
  if (Plan.hasScalarVFOnly())
    return;

  // Try to narrow wide and replicating recipes to single scalar recipes,
  // based on VPlan analysis. Only process blocks in the loop region for now,
  // without traversing into nested regions, as recipes in replicate regions
  // cannot be converted yet.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
        continue;
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
        continue;

      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
      // Skip recipes that aren't single scalars or don't have only their
      // scalar results used. In the latter case, we would introduce extra
      // broadcasts.
      if (!vputils::isSingleScalar(RepOrWidenR) ||
          any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
            return !U->usesScalars(RepOrWidenR);
          }))
        continue;

      auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
                                          RepOrWidenR->operands(),
                                          true /*IsSingleScalar*/);
      Clone->insertBefore(RepOrWidenR);
      RepOrWidenR->replaceAllUsesWith(Clone);
    }
  }
}

1257 /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1258 /// to make sure the masks are simplified.
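/// Illustrative sketch (operand order: value0[, mask0], value1, mask1, ...):
///   BLEND %a, %m.a, %b, %m.b   ; not normalized, every value is masked
/// becomes, once %a is chosen as the start value,
///   BLEND %a, %b, %m.b         ; normalized, %a is the fall-through value
/// which is valid because exactly one mask is true per lane.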
1259 static void simplifyBlends(VPlan &Plan) {
1260   using namespace llvm::VPlanPatternMatch;
1261   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1262            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1263     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1264       auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1265       if (!Blend)
1266         continue;
1267 
1268       // Try to remove redundant blend recipes.
1269       SmallPtrSet<VPValue *, 4> UniqueValues;
1270       if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1271         UniqueValues.insert(Blend->getIncomingValue(0));
1272       for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1273         if (!match(Blend->getMask(I), m_False()))
1274           UniqueValues.insert(Blend->getIncomingValue(I));
1275 
1276       if (UniqueValues.size() == 1) {
1277         Blend->replaceAllUsesWith(*UniqueValues.begin());
1278         Blend->eraseFromParent();
1279         continue;
1280       }
1281 
1282       if (Blend->isNormalized())
1283         continue;
1284 
1285       // Normalize the blend so its first incoming value is used as the initial
1286       // value with the others blended into it.
1287 
1288       unsigned StartIndex = 0;
1289       for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1290         // If a value's mask is used only by the blend then it can be deadcoded.
1291         // TODO: Find the most expensive mask that can be deadcoded, or a mask
1292         // that's used by multiple blends where it can be removed from them all.
1293         VPValue *Mask = Blend->getMask(I);
1294         if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1295           StartIndex = I;
1296           break;
1297         }
1298       }
1299 
1300       SmallVector<VPValue *, 4> OperandsWithMask;
1301       OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1302 
1303       for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1304         if (I == StartIndex)
1305           continue;
1306         OperandsWithMask.push_back(Blend->getIncomingValue(I));
1307         OperandsWithMask.push_back(Blend->getMask(I));
1308       }
1309 
1310       auto *NewBlend = new VPBlendRecipe(
1311           cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
1312       NewBlend->insertBefore(&R);
1313 
1314       VPValue *DeadMask = Blend->getMask(StartIndex);
1315       Blend->replaceAllUsesWith(NewBlend);
1316       Blend->eraseFromParent();
1317       recursivelyDeleteDeadRecipes(DeadMask);
1318 
1319       /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1320       VPValue *NewMask;
1321       if (NewBlend->getNumOperands() == 3 &&
1322           match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1323         VPValue *Inc0 = NewBlend->getOperand(0);
1324         VPValue *Inc1 = NewBlend->getOperand(1);
1325         VPValue *OldMask = NewBlend->getOperand(2);
1326         NewBlend->setOperand(0, Inc1);
1327         NewBlend->setOperand(1, Inc0);
1328         NewBlend->setOperand(2, NewMask);
1329         if (OldMask->getNumUsers() == 0)
1330           cast<VPInstruction>(OldMask)->eraseFromParent();
1331       }
1332     }
1333   }
1334 }
1335 
1336 /// Optimize the width of vector induction variables in \p Plan based on a known
1337 /// constant Trip Count, \p BestVF and \p BestUF.
1338 static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1339                                                      ElementCount BestVF,
1340                                                      unsigned BestUF) {
1341   // Only proceed if we have not completely removed the vector region.
1342   if (!Plan.getVectorLoopRegion())
1343     return false;
1344 
1345   if (!Plan.getTripCount()->isLiveIn())
1346     return false;
1347   auto *TC = dyn_cast_if_present<ConstantInt>(
1348       Plan.getTripCount()->getUnderlyingValue());
1349   if (!TC || !BestVF.isFixed())
1350     return false;
1351 
1352   // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1353   // and UF. Returns at least 8.
1354   auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1355     APInt AlignedTC =
1356         Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
1357                                        APInt::Rounding::UP);
1358     APInt MaxVal = AlignedTC - 1;
1359     return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1360   };
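  // Worked example (illustrative): TC = 100 and VF * UF = 8 give
  // AlignedTC = 8 * ceil(100 / 8) = 104 and MaxVal = 103, which has 7 active
  // bits, so the narrowed IV type is i8 (max(PowerOf2Ceil(7), 8) == 8).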
1361   unsigned NewBitWidth =
1362       ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1363 
1364   LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
1365   auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1366 
1367   bool MadeChange = false;
1368 
1369   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1370   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1371     auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1372 
1373     // Currently only handle canonical IVs, as it is trivial to replace the
1374     // start and stop values, and only perform the optimization when the IV
1375     // has a single unique user.
1376     if (!WideIV || !WideIV->isCanonical() ||
1377         WideIV->hasMoreThanOneUniqueUser() ||
1378         NewIVTy == WideIV->getScalarType())
1379       continue;
1380 
1381     // Currently only handle cases where the single user is a header-mask
1382     // comparison with the backedge-taken-count.
1383     using namespace VPlanPatternMatch;
1384     if (!match(
1385             *WideIV->user_begin(),
1386             m_Binary<Instruction::ICmp>(
1387                 m_Specific(WideIV),
1388                 m_Broadcast(m_Specific(Plan.getOrCreateBackedgeTakenCount())))))
1389       continue;
1390 
1391     // Update IV operands and comparison bound to use new narrower type.
1392     auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1393     WideIV->setStartValue(NewStart);
1394     auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1395     WideIV->setStepValue(NewStep);
1396 
1397     auto *NewBTC = new VPWidenCastRecipe(
1398         Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1399     Plan.getVectorPreheader()->appendRecipe(NewBTC);
1400     auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1401     Cmp->setOperand(1, NewBTC);
1402 
1403     MadeChange = true;
1404   }
1405 
1406   return MadeChange;
1407 }
1408 
1409 /// Return true if \p Cond is known to be true for given \p BestVF and \p
1410 /// BestUF.
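/// For example (illustrative): with a known trip count of 8, BestVF = 8 and
/// BestUF = 1, a latch condition of the form
///   icmp eq (CanIV + VFxUF), vector-trip-count
/// is known true, because the trip count equals VF * UF.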
1411 static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1412                                       ElementCount BestVF, unsigned BestUF,
1413                                       ScalarEvolution &SE) {
1414   using namespace llvm::VPlanPatternMatch;
1415   if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue())))
1416     return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1417                                                           &SE](VPValue *C) {
1418       return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1419     });
1420 
1421   auto *CanIV = Plan.getCanonicalIV();
1422   if (!match(Cond, m_Binary<Instruction::ICmp>(
1423                        m_Specific(CanIV->getBackedgeValue()),
1424                        m_Specific(&Plan.getVectorTripCount()))) ||
1425       cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
1426           CmpInst::ICMP_EQ)
1427     return false;
1428 
1429   // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1430   // count is not conveniently available as SCEV so far, so we compare directly
1431   // against the original trip count. This is stricter than necessary, as we
1432   // will only return true if the trip count == vector trip count.
1433   // TODO: Use SCEV for vector trip count once available, to cover cases where
1434   // vector trip count == UF * VF, but original trip count != UF * VF.
1435   const SCEV *TripCount =
1436       vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1437   assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1438          "Trip count SCEV must be computable");
1439   ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1440   const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1441   return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
1442 }
1443 
1444 /// Try to simplify the branch condition of \p Plan. This may restrict the
1445 /// resulting plan to \p BestVF and \p BestUF.
1446 static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1447                                               unsigned BestUF,
1448                                               PredicatedScalarEvolution &PSE) {
1449   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1450   VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1451   auto *Term = &ExitingVPBB->back();
1452   VPValue *Cond;
1453   ScalarEvolution &SE = *PSE.getSE();
1454   using namespace llvm::VPlanPatternMatch;
1455   if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
1456       match(Term, m_BranchOnCond(
1457                       m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
1458     // Try to simplify the branch condition if TC <= VF * UF when the latch
1459     // terminator is BranchOnCount or BranchOnCond where the input is
1460     // Not(ActiveLaneMask).
1461     const SCEV *TripCount =
1462         vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1463     assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1464            "Trip count SCEV must be computable");
1465     ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1466     const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1467     if (TripCount->isZero() ||
1468         !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1469       return false;
1470   } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1471     // For BranchOnCond, check if we can prove the condition to be true using VF
1472     // and UF.
1473     if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1474       return false;
1475   } else {
1476     return false;
1477   }
1478 
1479   // The vector loop region only executes once. If possible, completely remove
1480   // the region, otherwise replace the terminator controlling the latch with
1481   // (BranchOnCond true).
1482   auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1483   auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
1484   if (all_of(
1485           Header->phis(),
1486           IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
1487     for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1488       auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR);
1489       HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue());
1490       HeaderPhiR->eraseFromParent();
1491     }
1492 
1493     VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1494     VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1495     VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1496     VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1497 
1498     for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1499       B->setParent(nullptr);
1500 
1501     VPBlockUtils::connectBlocks(Preheader, Header);
1502     VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1503     VPlanTransforms::simplifyRecipes(Plan, *CanIVTy);
1504   } else {
1505     // The vector region contains header phis for which we cannot remove the
1506     // loop region yet.
1507     LLVMContext &Ctx = SE.getContext();
1508     auto *BOC = new VPInstruction(
1509         VPInstruction::BranchOnCond,
1510         {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
1511     ExitingVPBB->appendRecipe(BOC);
1512   }
1513 
1514   Term->eraseFromParent();
1515 
1516   return true;
1517 }
1518 
1519 void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1520                                          unsigned BestUF,
1521                                          PredicatedScalarEvolution &PSE) {
1522   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1523   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1524 
1525   bool MadeChange =
1526       simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1527   MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1528 
1529   if (MadeChange) {
1530     Plan.setVF(BestVF);
1531     assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1532   }
1533   // TODO: Further simplifications are possible
1534   //      1. Replace inductions with constants.
1535   //      2. Replace vector loop region with VPBasicBlock.
1536 }
1537 
1538 /// Sink users of \p FOR after the recipe defining the previous value \p
1539 /// Previous of the recurrence. \returns true if all users of \p FOR could be
1540 /// re-arranged as needed or false if it is not possible.
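/// Illustrative sketch (VPlan notation simplified):
///   %for  = FIRST-ORDER-RECURRENCE-PHI [ %init ], [ %prev ]
///   %use  = add %for, 1
///   %prev = load %ptr
/// Here %use is sunk after %prev, so a splice of (%for, %prev) can later be
/// placed before every user of %for.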
1541 static bool
1542 sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1543                                  VPRecipeBase *Previous,
1544                                  VPDominatorTree &VPDT) {
1545   // Collect recipes that need sinking.
1546   SmallVector<VPRecipeBase *> WorkList;
1547   SmallPtrSet<VPRecipeBase *, 8> Seen;
1548   Seen.insert(Previous);
1549   auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1550     // The previous value must not depend on the users of the recurrence
1551     // phi; if it did, FOR would not be a fixed-order recurrence.
1552     if (SinkCandidate == Previous)
1553       return false;
1554 
1555     if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1556         !Seen.insert(SinkCandidate).second ||
1557         VPDT.properlyDominates(Previous, SinkCandidate))
1558       return true;
1559 
1560     if (SinkCandidate->mayHaveSideEffects())
1561       return false;
1562 
1563     WorkList.push_back(SinkCandidate);
1564     return true;
1565   };
1566 
1567   // Recursively sink users of FOR after Previous.
1568   WorkList.push_back(FOR);
1569   for (unsigned I = 0; I != WorkList.size(); ++I) {
1570     VPRecipeBase *Current = WorkList[I];
1571     assert(Current->getNumDefinedValues() == 1 &&
1572            "only recipes with a single defined value expected");
1573 
1574     for (VPUser *User : Current->getVPSingleValue()->users()) {
1575       if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1576         return false;
1577     }
1578   }
1579 
1580   // Keep recipes to sink ordered by dominance so earlier instructions are
1581   // processed first.
1582   sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1583     return VPDT.properlyDominates(A, B);
1584   });
1585 
1586   for (VPRecipeBase *SinkCandidate : WorkList) {
1587     if (SinkCandidate == FOR)
1588       continue;
1589 
1590     SinkCandidate->moveAfter(Previous);
1591     Previous = SinkCandidate;
1592   }
1593   return true;
1594 }
1595 
1596 /// Try to hoist \p Previous and its operands before all users of \p FOR.
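/// Illustrative counterpart to sinking: if sinking fails, e.g. because a user
/// of \p FOR has side effects, but \p Previous only depends on values defined
/// before the first user, \p Previous and its operands are moved up instead.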
1597 static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1598                                         VPRecipeBase *Previous,
1599                                         VPDominatorTree &VPDT) {
1600   if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1601     return false;
1602 
1603   // Collect recipes that need hoisting.
1604   SmallVector<VPRecipeBase *> HoistCandidates;
1605   SmallPtrSet<VPRecipeBase *, 8> Visited;
1606   VPRecipeBase *HoistPoint = nullptr;
1607   // Find the closest hoist point by looking at all users of FOR and selecting
1608   // the recipe dominating all other users.
1609   for (VPUser *U : FOR->users()) {
1610     auto *R = cast<VPRecipeBase>(U);
1611     if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1612       HoistPoint = R;
1613   }
1614   assert(all_of(FOR->users(),
1615                 [&VPDT, HoistPoint](VPUser *U) {
1616                   auto *R = cast<VPRecipeBase>(U);
1617                   return HoistPoint == R ||
1618                          VPDT.properlyDominates(HoistPoint, R);
1619                 }) &&
1620          "HoistPoint must dominate all users of FOR");
1621 
1622   auto NeedsHoisting = [HoistPoint, &VPDT,
1623                         &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1624     VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1625     if (!HoistCandidate)
1626       return nullptr;
1627     VPRegionBlock *EnclosingLoopRegion =
1628         HoistCandidate->getParent()->getEnclosingLoopRegion();
1629     assert((!HoistCandidate->getParent()->getParent() ||
1630             HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1631            "CFG in VPlan should still be flat, without replicate regions");
1632     // Hoist candidate was already visited, no need to hoist.
1633     if (!Visited.insert(HoistCandidate).second)
1634       return nullptr;
1635 
1636     // Candidate is outside the loop region or a header phi; it dominates
1637     // the FOR users without hoisting.
1638     if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1639       return nullptr;
1640 
1641     // If we reached a recipe that dominates HoistPoint, we don't need to
1642     // hoist the recipe.
1643     if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1644       return nullptr;
1645     return HoistCandidate;
1646   };
1647   auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1648     // Avoid hoisting candidates with side-effects, as we do not yet analyze
1649     // associated dependencies.
1650     return !HoistCandidate->mayHaveSideEffects();
1651   };
1652 
1653   if (!NeedsHoisting(Previous->getVPSingleValue()))
1654     return true;
1655 
1656   // Recursively try to hoist Previous and its operands before all users of FOR.
1657   HoistCandidates.push_back(Previous);
1658 
1659   for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1660     VPRecipeBase *Current = HoistCandidates[I];
1661     assert(Current->getNumDefinedValues() == 1 &&
1662            "only recipes with a single defined value expected");
1663     if (!CanHoist(Current))
1664       return false;
1665 
1666     for (VPValue *Op : Current->operands()) {
1667       // If we reach FOR, it means the original Previous depends on some other
1668       // recurrence that in turn depends on FOR. If that is the case, we would
1669       // also need to hoist recipes involving the other FOR, which may break
1670       // dependencies.
1671       if (Op == FOR)
1672         return false;
1673 
1674       if (auto *R = NeedsHoisting(Op))
1675         HoistCandidates.push_back(R);
1676     }
1677   }
1678 
1679   // Order recipes to hoist by dominance so earlier instructions are processed
1680   // first.
1681   sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1682     return VPDT.properlyDominates(A, B);
1683   });
1684 
1685   for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1686     HoistCandidate->moveBefore(*HoistPoint->getParent(),
1687                                HoistPoint->getIterator());
1688   }
1689 
1690   return true;
1691 }
1692 
1693 bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1694                                                   VPBuilder &LoopBuilder) {
1695   VPDominatorTree VPDT;
1696   VPDT.recalculate(Plan);
1697 
1698   SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1699   for (VPRecipeBase &R :
1700        Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
1701     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
1702       RecurrencePhis.push_back(FOR);
1703 
1704   for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1705     SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1706     VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1707     // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1708     // to terminate.
1709     while (auto *PrevPhi =
1710                dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
1711       assert(PrevPhi->getParent() == FOR->getParent());
1712       assert(SeenPhis.insert(PrevPhi).second);
1713       Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1714     }
1715 
1716     if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1717         !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1718       return false;
1719 
1720     // Introduce a recipe to combine the incoming and previous values of a
1721     // fixed-order recurrence.
1722     VPBasicBlock *InsertBlock = Previous->getParent();
1723     if (isa<VPHeaderPHIRecipe>(Previous))
1724       LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
1725     else
1726       LoopBuilder.setInsertPoint(InsertBlock,
1727                                  std::next(Previous->getIterator()));
1728 
1729     auto *RecurSplice =
1730         LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
1731                                  {FOR, FOR->getBackedgeValue()});
1732 
1733     FOR->replaceAllUsesWith(RecurSplice);
1734     // Set the first operand of RecurSplice to FOR again, after replacing
1735     // all users.
1736     RecurSplice->setOperand(0, FOR);
1737   }
1738   return true;
1739 }
1740 
1741 void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
1742   for (VPRecipeBase &R :
1743        Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
1744     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
1745     if (!PhiR)
1746       continue;
1747     RecurKind RK = PhiR->getRecurrenceKind();
1748     if (RK != RecurKind::Add && RK != RecurKind::Mul)
1749       continue;
1750 
1751     for (VPUser *U : collectUsersRecursively(PhiR))
1752       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
1753         RecWithFlags->dropPoisonGeneratingFlags();
1754       }
1755   }
1756 }
1757 
1758 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
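/// E.g. (illustrative): %inv = mul %live-in.a, %live-in.b computed inside the
/// loop region only uses values defined outside the region, so it is moved to
/// the preheader.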
1759 static void licm(VPlan &Plan) {
1760   VPBasicBlock *Preheader = Plan.getVectorPreheader();
1761 
1762   // Return true if we do not know how to (mechanically) hoist a given recipe
1763   // out of a loop region. Does not address legality concerns such as aliasing
1764   // or speculation safety.
1765   auto CannotHoistRecipe = [](VPRecipeBase &R) {
1766     // Allocas cannot be hoisted.
1767     auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1768     return RepR && RepR->getOpcode() == Instruction::Alloca;
1769   };
1770 
1771   // Hoist any loop invariant recipes from the vector loop region to the
1772   // preheader. Perform a shallow traversal of the vector loop region, to
1773   // exclude recipes in replicate regions.
1774   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1775   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1776            vp_depth_first_shallow(LoopRegion->getEntry()))) {
1777     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1778       if (CannotHoistRecipe(R))
1779         continue;
1780       // TODO: Relax checks in the future, e.g. we could also hoist reads, if
1781       // their memory location is not modified in the vector loop.
1782       if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
1783           any_of(R.operands(), [](VPValue *Op) {
1784             return !Op->isDefinedOutsideLoopRegions();
1785           }))
1786         continue;
1787       R.moveBefore(*Preheader, Preheader->end());
1788     }
1789   }
1790 }
1791 
1792 void VPlanTransforms::truncateToMinimalBitwidths(
1793     VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
1794   // Keep track of created truncates, so they can be re-used. Note that we
1795   // cannot use RAUW after creating a new truncate, as this could make
1796   // other uses have different types for their operands, making them invalidly
1797   // typed.
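  // Illustrative sketch of the narrowing below: an i32 add known to need only
  // 8 bits becomes
  //   %t0 = trunc i32 %a to i8 ; %t1 = trunc i32 %b to i8
  //   %r  = add i8 %t0, %t1 ; %x = zext i8 %r to i32
  // with the truncates cached in ProcessedTruncs for re-use.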
1798   DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
1799   Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
1800   VPTypeAnalysis TypeInfo(CanonicalIVType);
1801   VPBasicBlock *PH = Plan.getVectorPreheader();
1802   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1803            vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
1804     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1805       if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
1806                VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(
1807               &R))
1808         continue;
1809 
1810       VPValue *ResultVPV = R.getVPSingleValue();
1811       auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
1812       unsigned NewResSizeInBits = MinBWs.lookup(UI);
1813       if (!NewResSizeInBits)
1814         continue;
1815 
1816       // If the value wasn't vectorized, we must maintain the original scalar
1817       // type. Skip those here. Also
1818       // skip casts which do not need to be handled explicitly here, as
1819       // redundant casts will be removed during recipe simplification.
1820       if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R))
1821         continue;
1822 
1823       Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
1824       unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
1825       assert(OldResTy->isIntegerTy() && "only integer types supported");
1826       (void)OldResSizeInBits;
1827 
1828       LLVMContext &Ctx = CanonicalIVType->getContext();
1829       auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
1830 
1831       // Any wrapping introduced by shrinking this operation shouldn't be
1832       // considered undefined behavior. So, we can't unconditionally copy
1833       // arithmetic wrapping flags to VPW.
1834       if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
1835         VPW->dropPoisonGeneratingFlags();
1836 
1837       using namespace llvm::VPlanPatternMatch;
1838       if (OldResSizeInBits != NewResSizeInBits &&
1839           !match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) {
1840         // Extend result to original width.
1841         auto *Ext =
1842             new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
1843         Ext->insertAfter(&R);
1844         ResultVPV->replaceAllUsesWith(Ext);
1845         Ext->setOperand(0, ResultVPV);
1846         assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
1847       } else {
1848         assert(
1849             match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
1850             "Only ICmps should not need extending the result.");
1851       }
1852 
1853       assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
1854       if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(&R))
1855         continue;
1856 
1857       // Shrink operands by introducing truncates as needed.
1858       unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
1859       for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
1860         auto *Op = R.getOperand(Idx);
1861         unsigned OpSizeInBits =
1862             TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
1863         if (OpSizeInBits == NewResSizeInBits)
1864           continue;
1865         assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
1866         auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
1867         VPWidenCastRecipe *NewOp =
1868             IterIsEmpty
1869                 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
1870                 : ProcessedIter->second;
1871         R.setOperand(Idx, NewOp);
1872         if (!IterIsEmpty)
1873           continue;
1874         ProcessedIter->second = NewOp;
1875         if (!Op->isLiveIn()) {
1876           NewOp->insertBefore(&R);
1877         } else {
1878           PH->appendRecipe(NewOp);
1879         }
1880       }
1881 
1882     }
1883   }
1884 }
1885 
1886 /// Remove BranchOnCond recipes with true or false conditions together with
1887 /// removing dead edges to their successors.
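/// E.g. (illustrative): BranchOnCond true with successors (then, else) is
/// erased together with the edge to else; if else becomes unreachable, it is
/// deleted when the VPlan is destroyed.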
1888 static void removeBranchOnConst(VPlan &Plan) {
1889   using namespace llvm::VPlanPatternMatch;
1890   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1891            vp_depth_first_shallow(Plan.getEntry()))) {
1892     VPValue *Cond;
1893     if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
1894         !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
1895       continue;
1896 
1897     unsigned RemovedIdx;
1898     if (match(Cond, m_True()))
1899       RemovedIdx = 1;
1900     else if (match(Cond, m_False()))
1901       RemovedIdx = 0;
1902     else
1903       continue;
1904 
1905     VPBasicBlock *RemovedSucc =
1906         cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
1907     assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
1908            "There must be a single edge between VPBB and its successor");
1909     // Values coming from VPBB into phi recipes of RemovedSucc are removed from
1910     // these recipes.
1911     for (VPRecipeBase &R : RemovedSucc->phis()) {
1912       auto *Phi = cast<VPPhiAccessors>(&R);
1913       assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) &&
1914              "VPIRPhis must have a single predecessor");
1915       Phi->removeIncomingValueFor(VPBB);
1916     }
1917     // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
1918     // automatically on VPlan destruction if it becomes unreachable.
1919     VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
1920     VPBB->back().eraseFromParent();
1921   }
1922 }
1923 
1924 void VPlanTransforms::optimize(VPlan &Plan) {
1925   runPass(removeRedundantCanonicalIVs, Plan);
1926   runPass(removeRedundantInductionCasts, Plan);
1927 
1928   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
1929   runPass(simplifyBlends, Plan);
1930   runPass(removeDeadRecipes, Plan);
1931   runPass(narrowToSingleScalarRecipes, Plan);
1932   runPass(legalizeAndOptimizeInductions, Plan);
1933   runPass(removeRedundantExpandSCEVRecipes, Plan);
1934   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
1935   runPass(removeBranchOnConst, Plan);
1936   runPass(removeDeadRecipes, Plan);
1937 
1938   runPass(createAndOptimizeReplicateRegions, Plan);
1939   runPass(mergeBlocksIntoPredecessors, Plan);
1940   runPass(licm, Plan);
1941 }
1942 
1943 // Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
1944 // the loop terminator with a branch-on-cond recipe with the negated
1945 // active-lane-mask as operand. Note that this turns the loop into an
1946 // uncountable one. Only the existing terminator is replaced, all other existing
1947 // recipes/users remain unchanged, except for poison-generating flags being
1948 // dropped from the canonical IV increment. Return the created
1949 // VPActiveLaneMaskPHIRecipe.
1950 //
1951 // The function uses the following definitions:
1952 //
1953 //  %TripCount = DataAndControlFlowWithoutRuntimeCheck ?
1954 //    calculate-trip-count-minus-VF (original TC) : original TC
1955 //  %IncrementValue = DataAndControlFlowWithoutRuntimeCheck ?
1956 //     CanonicalIVPhi : CanonicalIVIncrement
1957 //  %StartV is the canonical induction start value.
1958 //
1959 // The function adds the following recipes:
1960 //
1961 // vector.ph:
1962 //   %TripCount = calculate-trip-count-minus-VF (original TC)
1963 //       [if DataAndControlFlowWithoutRuntimeCheck]
1964 //   %EntryInc = canonical-iv-increment-for-part %StartV
1965 //   %EntryALM = active-lane-mask %EntryInc, %TripCount
1966 //
1967 // vector.body:
1968 //   ...
1969 //   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
1970 //   ...
1971 //   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
1972 //   %ALM = active-lane-mask %InLoopInc, %TripCount
1973 //   %Negated = Not %ALM
1974 //   branch-on-cond %Negated
1975 //
1976 static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
1977     VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
1978   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
1979   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
1980   auto *CanonicalIVPHI = Plan.getCanonicalIV();
1981   VPValue *StartV = CanonicalIVPHI->getStartValue();
1982 
1983   auto *CanonicalIVIncrement =
1984       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
1985   // TODO: Check if dropping the flags is needed if
1986   // !DataAndControlFlowWithoutRuntimeCheck.
1987   CanonicalIVIncrement->dropPoisonGeneratingFlags();
1988   DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
1989   // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
1990   // we have to take unrolling into account. Each part needs to start at
1991   //   Part * VF
1992   auto *VecPreheader = Plan.getVectorPreheader();
1993   VPBuilder Builder(VecPreheader);
1994 
1995   // Create the ActiveLaneMask instruction using the correct start values.
1996   VPValue *TC = Plan.getTripCount();
1997 
1998   VPValue *TripCount, *IncrementValue;
1999   if (!DataAndControlFlowWithoutRuntimeCheck) {
2000     // When the loop is guarded by a runtime overflow check for the loop
2001     // induction variable increment by VF, we can increment the value before
2002     // the get.active.lane.mask intrinsic and use the unmodified trip count.
2003     IncrementValue = CanonicalIVIncrement;
2004     TripCount = TC;
2005   } else {
2006     // When avoiding a runtime check, the active.lane.mask inside the loop
2007     // uses a modified trip count and the induction variable increment is
2008     // done after the active.lane.mask intrinsic is called.
2009     IncrementValue = CanonicalIVPHI;
2010     TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2011                                      {TC}, DL);
2012   }
2013   auto *EntryIncrement = Builder.createOverflowingOp(
2014       VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2015       "index.part.next");
2016 
2017   // Create the active lane mask instruction in the VPlan preheader.
2018   auto *EntryALM =
2019       Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
2020                            DL, "active.lane.mask.entry");
2021 
2022   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2023   // preheader ActiveLaneMask instruction.
2024   auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
2025   LaneMaskPhi->insertAfter(CanonicalIVPHI);
2026 
2027   // Create the active lane mask for the next iteration of the loop before the
2028   // original terminator.
2029   VPRecipeBase *OriginalTerminator = EB->getTerminator();
2030   Builder.setInsertPoint(OriginalTerminator);
2031   auto *InLoopIncrement =
2032       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2033                                   {IncrementValue}, {false, false}, DL);
2034   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2035                                    {InLoopIncrement, TripCount}, DL,
2036                                    "active.lane.mask.next");
2037   LaneMaskPhi->addOperand(ALM);
2038 
2039   // Replace the original terminator with BranchOnCond. We have to invert the
2040   // mask here because a true condition means jumping to the exit block.
2041   auto *NotMask = Builder.createNot(ALM, DL);
2042   Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2043   OriginalTerminator->eraseFromParent();
2044   return LaneMaskPhi;
2045 }
2046 
2047 /// Collect all VPValues representing a header mask through the (ICMP_ULE,
2048 /// WideCanonicalIV, backedge-taken-count) pattern.
2049 /// TODO: Introduce explicit recipe for header-mask instead of searching
2050 /// for the header-mask pattern manually.
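/// Illustrative shape of the collected pattern:
///   %wide.iv = WIDEN-CANONICAL-INDUCTION %canonical.iv
///   %mask    = icmp ule %wide.iv, %backedge-taken-count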
2051 static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
2052   SmallVector<VPValue *> WideCanonicalIVs;
2053   auto *FoundWidenCanonicalIVUser =
2054       find_if(Plan.getCanonicalIV()->users(),
2055               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
2056   assert(count_if(Plan.getCanonicalIV()->users(),
2057                   [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
2058              1 &&
2059          "Must have at most one VPWidenCanonicalIVRecipe");
2060   if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2061     auto *WideCanonicalIV =
2062         cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2063     WideCanonicalIVs.push_back(WideCanonicalIV);
2064   }
2065 
2066   // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2067   // version of the canonical induction.
2068   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2069   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2070     auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2071     if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2072       WideCanonicalIVs.push_back(WidenOriginalIV);
2073   }
2074 
2075   // Walk users of wide canonical IVs and collect all compares of the form
2076   // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2077   SmallVector<VPValue *> HeaderMasks;
2078   for (auto *Wide : WideCanonicalIVs) {
2079     for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2080       auto *HeaderMask = dyn_cast<VPInstruction>(U);
2081       if (!HeaderMask || !vputils::isHeaderMask(HeaderMask, Plan))
2082         continue;
2083 
2084       assert(HeaderMask->getOperand(0) == Wide &&
2085              "WidenCanonicalIV must be the first operand of the compare");
2086       HeaderMasks.push_back(HeaderMask);
2087     }
2088   }
2089   return HeaderMasks;
2090 }
2091 
2092 void VPlanTransforms::addActiveLaneMask(
2093     VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2094     bool DataAndControlFlowWithoutRuntimeCheck) {
2095   assert((!DataAndControlFlowWithoutRuntimeCheck ||
2096           UseActiveLaneMaskForControlFlow) &&
2097          "DataAndControlFlowWithoutRuntimeCheck implies "
2098          "UseActiveLaneMaskForControlFlow");
2099 
2100   auto *FoundWidenCanonicalIVUser =
2101       find_if(Plan.getCanonicalIV()->users(),
2102               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
2103   assert(FoundWidenCanonicalIVUser &&
2104          "Must have widened canonical IV when tail folding!");
2105   auto *WideCanonicalIV =
2106       cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2107   VPSingleDefRecipe *LaneMask;
2108   if (UseActiveLaneMaskForControlFlow) {
2109     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2110         Plan, DataAndControlFlowWithoutRuntimeCheck);
2111   } else {
2112     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2113     LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
2114                               {WideCanonicalIV, Plan.getTripCount()}, nullptr,
2115                               "active.lane.mask");
2116   }
2117 
2118   // Walk users of WideCanonicalIV and replace all compares of the form
2119   // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
2120   // active-lane-mask.
2121   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
2122     HeaderMask->replaceAllUsesWith(LaneMask);
2123 }
2124 
2125 /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2126 /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2127 /// recipe could be created.
2128 /// \p HeaderMask  Header Mask.
2129 /// \p CurRecipe   Recipe to be transformed.
2130 /// \p TypeInfo    VPlan-based type analysis.
2131 /// \p AllOneMask  The vector mask parameter of vector-predication intrinsics.
2132 /// \p EVL         The explicit vector length parameter of vector-predication
2133 /// intrinsics.
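/// Illustrative example of the rewrite: a header-masked load
///   WIDEN load %addr, %header-mask
/// becomes an EVL-based load without the mask,
///   WIDEN vp.load %addr, %EVL
/// because limiting the active lanes via EVL subsumes the header mask.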
2134 static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2135                                        VPRecipeBase &CurRecipe,
2136                                        VPTypeAnalysis &TypeInfo,
2137                                        VPValue &AllOneMask, VPValue &EVL) {
2138   using namespace llvm::VPlanPatternMatch;
2139   auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2140     assert(OrigMask && "Unmasked recipe when folding tail");
2141     // HeaderMask will be handled using EVL.
2142     VPValue *Mask;
2143     if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2144       return Mask;
2145     return HeaderMask == OrigMask ? nullptr : OrigMask;
2146   };
2147 
2148   return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
2149       .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
2150         VPValue *NewMask = GetNewMask(L->getMask());
2151         return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
2152       })
2153       .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2154         VPValue *NewMask = GetNewMask(S->getMask());
2155         return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
2156       })
2157       .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2158         VPValue *NewMask = GetNewMask(Red->getCondOp());
2159         return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2160       })
2161       .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2162         VPValue *LHS, *RHS;
2163         // Transform select with a header mask condition
2164         //   select(header_mask, LHS, RHS)
2165         // into vector predication merge.
2166         //   vp.merge(all-true, LHS, RHS, EVL)
2167         if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2168                                  m_VPValue(RHS))))
2169           return nullptr;
2170         // Use all true as the condition because this transformation is
2171         // limited to selects whose condition is a header mask.
2172         return new VPWidenIntrinsicRecipe(
2173             Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2174             TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2175       })
2176       .Default([&](VPRecipeBase *R) { return nullptr; });
2177 }
2178 
2179 /// Replace recipes with their EVL variants.
2180 static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2181   Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
2182   VPTypeAnalysis TypeInfo(CanonicalIVType);
2183   LLVMContext &Ctx = CanonicalIVType->getContext();
2184   VPValue *AllOneMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
2185   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2186   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2187 
2188   assert(all_of(Plan.getVF().users(),
2189                 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
2190                         VPWidenIntOrFpInductionRecipe>) &&
2191          "User of VF that we can't transform to EVL.");
2192   Plan.getVF().replaceAllUsesWith(&EVL);
2193 
2194   // Defer erasing recipes till the end so that we don't invalidate the
2195   // VPTypeAnalysis cache.
2196   SmallVector<VPRecipeBase *> ToErase;
2197 
2198   // Create a scalar phi to track the previous EVL if the plan contains a
2199   // fixed-order recurrence.
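  // Illustrative sketch: FirstOrderRecurrenceSplice %v1, %v2 is rewritten
  // below as
  //   vp.splice(%v1, %v2, -1, all-true, %prev.evl, %evl)
  // so the spliced element counts follow the EVL of the previous and current
  // iterations.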
2200   bool ContainsFORs =
2201       any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2202   if (ContainsFORs) {
2203     // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2204     VPValue *MaxEVL = &Plan.getVF();
2205     // Emit a VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2206     VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2207     MaxEVL = Builder.createScalarZExtOrTrunc(MaxEVL, Type::getInt32Ty(Ctx),
2208                                              TypeInfo.inferScalarType(MaxEVL),
2209                                              DebugLoc());
2210 
2211     Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2212     VPValue *PrevEVL =
2213         Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
2214 
2215     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2216              vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
2217       for (VPRecipeBase &R : *VPBB) {
2218         using namespace VPlanPatternMatch;
2219         VPValue *V1, *V2;
2220         if (!match(&R,
2221                    m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2222                        m_VPValue(V1), m_VPValue(V2))))
2223           continue;
2224         VPValue *Imm = Plan.getOrAddLiveIn(
2225             ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1));
2226         VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
2227             Intrinsic::experimental_vp_splice,
2228             {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2229             TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2230         VPSplice->insertBefore(&R);
2231         R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2232         ToErase.push_back(&R);
2233       }
2234     }
2235   }
2236 
2237   // Try to optimize header mask recipes away to their EVL variants.
2238   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
2239     for (VPUser *U : collectUsersRecursively(HeaderMask)) {
2240       auto *CurRecipe = cast<VPRecipeBase>(U);
2241       VPRecipeBase *EVLRecipe =
2242           optimizeMaskToEVL(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2243       if (!EVLRecipe)
2244         continue;
2245 
2246       [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2247       assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2248              "New recipe must define the same number of values as the "
2249              "original.");
2250       assert(
2251           NumDefVal <= 1 &&
2252           "Only supports recipes with a single definition or without users.");
2253       EVLRecipe->insertBefore(CurRecipe);
2254       if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
2255         VPValue *CurVPV = CurRecipe->getVPSingleValue();
2256         CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
2257       }
2258       ToErase.push_back(CurRecipe);
2259     }
2260   }
2261 
2262   for (VPRecipeBase *R : reverse(ToErase)) {
2263     SmallVector<VPValue *> PossiblyDead(R->operands());
2264     R->eraseFromParent();
2265     for (VPValue *Op : PossiblyDead)
2266       recursivelyDeleteDeadRecipes(Op);
2267   }
2268 }
2269 
2270 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2271 /// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
2272 /// increment, with a VPEVLBasedIVPHIRecipe. The VPCanonicalIVPHIRecipe is
2273 /// used only for counting loop iterations after this transformation.
2274 ///
2275 /// The function uses the following definitions:
2276 ///  %StartV is the canonical induction start value.
2277 ///
2278 /// The function adds the following recipes:
2279 ///
2280 /// vector.ph:
2281 /// ...
2282 ///
2283 /// vector.body:
2284 /// ...
2285 /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2286 ///                                               [ %NextEVLIV, %vector.body ]
2287 /// %AVL = sub original TC, %EVLPhi
2288 /// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2289 /// ...
2290 /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2291 /// ...
2292 ///
2293 /// If MaxSafeElements is provided, the function adds the following recipes:
2294 /// vector.ph:
2295 /// ...
2296 ///
2297 /// vector.body:
2298 /// ...
2299 /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2300 ///                                               [ %NextEVLIV, %vector.body ]
2301 /// %AVL = sub original TC, %EVLPhi
2302 /// %cmp = cmp ult %AVL, MaxSafeElements
2303 /// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2304 /// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2305 /// ...
2306 /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2307 /// ...
2308 ///
2309 bool VPlanTransforms::tryAddExplicitVectorLength(
2310     VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2311   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2312   // The transform updates all users of inductions to work based on EVL, instead
2313   // of the VF directly. At the moment, widened pointer inductions cannot be
2314   // updated, so bail out if the plan contains any.
2315   bool ContainsWidenPointerInductions =
2316       any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>);
2317   if (ContainsWidenPointerInductions)
2318     return false;
2319 
2320   auto *CanonicalIVPHI = Plan.getCanonicalIV();
2321   auto *CanIVTy = CanonicalIVPHI->getScalarType();
2322   VPValue *StartV = CanonicalIVPHI->getStartValue();
2323 
2324   // Create the ExplicitVectorLengthPhi recipe in the main loop.
2325   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
2326   EVLPhi->insertAfter(CanonicalIVPHI);
2327   VPBuilder Builder(Header, Header->getFirstNonPhi());
2328   // Compute original TC - IV as the AVL (application vector length).
2329   VPValue *AVL = Builder.createNaryOp(
2330       Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
2331   if (MaxSafeElements) {
2332     // Clamp the AVL by MaxSafeElements to respect the maximum safe dependence distance.
2333     VPValue *AVLSafe =
2334         Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
2335     VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2336     AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
2337   }
2338   auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2339                                      DebugLoc());
2340 
2341   auto *CanonicalIVIncrement =
2342       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2343   Builder.setInsertPoint(CanonicalIVIncrement);
2344   VPValue *OpVPEVL = VPEVL;
2345 
2346   auto *I32Ty = Type::getInt32Ty(CanIVTy->getContext());
2347   OpVPEVL = Builder.createScalarZExtOrTrunc(
2348       OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2349 
2350   auto *NextEVLIV = Builder.createOverflowingOp(
2351       Instruction::Add, {OpVPEVL, EVLPhi},
2352       {CanonicalIVIncrement->hasNoUnsignedWrap(),
2353        CanonicalIVIncrement->hasNoSignedWrap()},
2354       CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2355   EVLPhi->addOperand(NextEVLIV);
2356 
2357   transformRecipestoEVLRecipes(Plan, *VPEVL);
2358 
2359   // Replace all uses of VPCanonicalIVPHIRecipe by
2360   // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2361   CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2362   CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2363   // TODO: support unroll factor > 1.
2364   Plan.setUF(1);
2365   return true;
2366 }
2367 
2368 void VPlanTransforms::dropPoisonGeneratingRecipes(
2369     VPlan &Plan,
2370     const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2371   // Collect recipes in the backward slice of `Root` that may generate a poison
2372   // value that is used after vectorization.
2373   SmallPtrSet<VPRecipeBase *, 16> Visited;
2374   auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2375     SmallVector<VPRecipeBase *, 16> Worklist;
2376     Worklist.push_back(Root);
2377 
2378     // Traverse the backward slice of Root through its use-def chain.
2379     while (!Worklist.empty()) {
2380       VPRecipeBase *CurRec = Worklist.pop_back_val();
2381 
2382       if (!Visited.insert(CurRec).second)
2383         continue;
2384 
2385       // Prune search if we find another recipe generating a widen memory
2386       // instruction. Widen memory instructions involved in address computation
2387       // will lead to gather/scatter instructions, which don't need to be
2388       // handled.
2389       if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
2390               VPHeaderPHIRecipe>(CurRec))
2391         continue;
2392 
2393       // This recipe contributes to the address computation of a widen
2394       // load/store. If the underlying instruction has poison-generating flags,
2395       // drop them directly.
2396       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
2397         VPValue *A, *B;
2398         using namespace llvm::VPlanPatternMatch;
2399         // Dropping disjoint from an OR may yield incorrect results, as some
2400         // analysis may have converted it to an Add implicitly (e.g. SCEV used
2401         // for dependence analysis). Instead, replace it with an equivalent Add.
2402         // This is possible as all users of the disjoint OR only access lanes
2403         // where the operands are disjoint or poison otherwise.
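        // Worked example (illustrative): for disjoint operands %a = 0b0100
        // and %b = 0b0001, both "or disjoint %a, %b" and "add %a, %b" yield
        // 0b0101, and the flag-free add cannot generate poison.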
2404         if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
2405             RecWithFlags->isDisjoint()) {
2406           VPBuilder Builder(RecWithFlags);
2407           VPInstruction *New = Builder.createOverflowingOp(
2408               Instruction::Add, {A, B}, {false, false},
2409               RecWithFlags->getDebugLoc());
2410           New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2411           RecWithFlags->replaceAllUsesWith(New);
2412           RecWithFlags->eraseFromParent();
2413           CurRec = New;
2414         } else
2415           RecWithFlags->dropPoisonGeneratingFlags();
2416       } else {
2417         Instruction *Instr = dyn_cast_or_null<Instruction>(
2418             CurRec->getVPSingleValue()->getUnderlyingValue());
2419         (void)Instr;
2420         assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2421                "found instruction with poison generating flags not covered by "
2422                "VPRecipeWithIRFlags");
2423       }
2424 
2425       // Add new definitions to the worklist.
2426       for (VPValue *Operand : CurRec->operands())
2427         if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2428           Worklist.push_back(OpDef);
2429     }
2430   });
2431 
2432   // Traverse all the recipes in the VPlan and collect the poison-generating
2433   // recipes in the backward slice starting at the address of a
2434   // VPWidenMemoryRecipe or VPInterleaveRecipe.
2435   auto Iter = vp_depth_first_deep(Plan.getEntry());
2436   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
2437     for (VPRecipeBase &Recipe : *VPBB) {
2438       if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
2439         Instruction &UnderlyingInstr = WidenRec->getIngredient();
2440         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2441         if (AddrDef && WidenRec->isConsecutive() &&
2442             BlockNeedsPredication(UnderlyingInstr.getParent()))
2443           CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2444       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
2445         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2446         if (AddrDef) {
2447           // Check if any member of the interleave group needs predication.
2448           const InterleaveGroup<Instruction> *InterGroup =
2449               InterleaveRec->getInterleaveGroup();
2450           bool NeedPredication = false;
2451           for (int I = 0, NumMembers = InterGroup->getNumMembers();
2452                I < NumMembers; ++I) {
2453             Instruction *Member = InterGroup->getMember(I);
2454             if (Member)
2455               NeedPredication |= BlockNeedsPredication(Member->getParent());
2456           }
2457 
2458           if (NeedPredication)
2459             CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2460         }
2461       }
2462     }
2463   }
2464 }
2465 
2466 void VPlanTransforms::createInterleaveGroups(
2467     VPlan &Plan,
2468     const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
2469         &InterleaveGroups,
2470     VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
2471   if (InterleaveGroups.empty())
2472     return;
2473 
2474   // Interleave memory: for each Interleave Group we marked earlier as relevant
2475   // for this VPlan, replace the Recipes widening its memory instructions with a
2476   // single VPInterleaveRecipe at its insertion point.
2477   VPDominatorTree VPDT;
2478   VPDT.recalculate(Plan);
2479   for (const auto *IG : InterleaveGroups) {
2480     SmallVector<VPValue *, 4> StoredValues;
2481     for (unsigned i = 0; i < IG->getFactor(); ++i)
2482       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
2483         auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
2484         StoredValues.push_back(StoreR->getStoredValue());
2485       }
2486 
2487     bool NeedsMaskForGaps =
2488         IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
2489 
2490     Instruction *IRInsertPos = IG->getInsertPos();
2491     auto *InsertPos =
2492         cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
2493 
2494     bool InBounds = false;
2495     if (auto *Gep = dyn_cast<GetElementPtrInst>(
2496             getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
2497       InBounds = Gep->isInBounds();
2498 
2499     // Get or create the start address for the interleave group.
2500     auto *Start =
2501         cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
2502     VPValue *Addr = Start->getAddr();
2503     VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
2504     if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
2505       // We cannot re-use the address of member zero because it does not
2506       // dominate the insert position. Instead, use the address of the insert
2507       // position and create a PtrAdd adjusting it to the address of member
2508       // zero.
2509       // TODO: Hoist Addr's defining recipe (and any operands as needed) to
2510       // InsertPos or sink loads above zero members to join it.
2511       assert(IG->getIndex(IRInsertPos) != 0 &&
2512              "index of insert position shouldn't be zero");
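      // As a hypothetical example: for a factor-2 group of i32 members whose
      // insert position is the member at index 1, the offset is 4 * 1 bytes,
      // so member zero's address is computed as ptradd(InsertPosAddr, -4).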
2513       auto &DL = IRInsertPos->getDataLayout();
2514       APInt Offset(32,
2515                    DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
2516                        IG->getIndex(IRInsertPos),
2517                    /*IsSigned=*/true);
2518       VPValue *OffsetVPV = Plan.getOrAddLiveIn(
2519           ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
2520       VPBuilder B(InsertPos);
2521       Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
2522                       : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
2523     }
2524     // If the group is reverse, adjust the index to refer to the last vector
2525     // lane instead of the first. We adjust the index from the first vector
2526     // lane, rather than directly getting the pointer for lane VF - 1, because
2527     // the pointer operand of the interleaved access is supposed to be uniform.
2528     if (IG->isReverse()) {
2529       auto *ReversePtr = new VPVectorEndPointerRecipe(
2530           Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
2531           -(int64_t)IG->getFactor(),
2532           InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(),
2533           InsertPos->getDebugLoc());
2534       ReversePtr->insertBefore(InsertPos);
2535       Addr = ReversePtr;
2536     }
2537     auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues, InsertPos->getMask(),
2538                                         NeedsMaskForGaps, InsertPos->getDebugLoc());
2539     VPIG->insertBefore(InsertPos);
2540 
2541     unsigned J = 0;
2542     for (unsigned i = 0; i < IG->getFactor(); ++i)
2543       if (Instruction *Member = IG->getMember(i)) {
2544         VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
2545         if (!Member->getType()->isVoidTy()) {
2546           VPValue *OriginalV = MemberR->getVPSingleValue();
2547           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
2548           J++;
2549         }
2550         MemberR->eraseFromParent();
2551       }
2552   }
2553 }
2554 
2555 /// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
2556 /// value, phi and backedge value. In the following example:
2557 ///
2558 ///  vector.ph:
2559 ///  Successor(s): vector loop
2560 ///
2561 ///  <x1> vector loop: {
2562 ///    vector.body:
2563 ///      WIDEN-INDUCTION %i = phi %start, %step, %vf
2564 ///      ...
2565 ///      EMIT branch-on-count ...
2566 ///    No successors
2567 ///  }
2568 ///
2569 /// WIDEN-INDUCTION will get expanded to:
2570 ///
2571 ///  vector.ph:
2572 ///    ...
2573 ///    vp<%induction.start> = ...
2574 ///    vp<%induction.increment> = ...
2575 ///
2576 ///  Successor(s): vector loop
2577 ///
2578 ///  <x1> vector loop: {
2579 ///    vector.body:
2580 ///      ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
2581 ///      ...
2582 ///      vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
2583 ///      EMIT branch-on-count ...
2584 ///    No successors
2585 ///  }
2586 static void
2587 expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
2588                               VPTypeAnalysis &TypeInfo) {
2589   VPlan *Plan = WidenIVR->getParent()->getPlan();
2590   VPValue *Start = WidenIVR->getStartValue();
2591   VPValue *Step = WidenIVR->getStepValue();
2592   VPValue *VF = WidenIVR->getVFValue();
2593   DebugLoc DL = WidenIVR->getDebugLoc();
2594 
2595   // The scalar type of the value in the original loop to which we are
2596   // mapping the new induction variable.
2597   Type *Ty = TypeInfo.inferScalarType(WidenIVR);
2598 
2599   const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
2600   Instruction::BinaryOps AddOp;
2601   Instruction::BinaryOps MulOp;
2602   // FIXME: The newly created binary instructions should carry the nsw/nuw
2603   // flags, which can be taken from the original scalar operations.
2604   VPIRFlags Flags;
2605   if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
2606     AddOp = Instruction::Add;
2607     MulOp = Instruction::Mul;
2608   } else {
2609     AddOp = ID.getInductionOpcode();
2610     MulOp = Instruction::FMul;
2611     Flags = ID.getInductionBinOp()->getFastMathFlags();
2612   }
2613 
2614   // If the phi is truncated, truncate the start and step values.
2615   VPBuilder Builder(Plan->getVectorPreheader());
2616   Type *StepTy = TypeInfo.inferScalarType(Step);
2617   if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
2618     assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
2619     Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
2620     Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
2621     StepTy = Ty;
2622   }
2623 
2624   // Construct the initial value of the vector IV in the vector loop preheader.
2625   Type *IVIntTy =
2626       IntegerType::get(StepTy->getContext(), StepTy->getScalarSizeInBits());
2627   VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
2628   if (StepTy->isFloatingPointTy())
2629     Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
2630 
2631   VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
2632   VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
2633 
2634   Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
2635   Init =
2636       Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");
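  // As a sketch, for start %s, step %st and VF 4, the preheader now computes
  // (in pseudo-notation):
  //   vp<%induction> = broadcast(%s) + <0, 1, 2, 3> * broadcast(%st)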
2637 
2638   // Create the widened phi of the vector IV.
2639   auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
2640                                        WidenIVR->getDebugLoc(), "vec.ind");
2641   WidePHI->addOperand(Init);
2642   WidePHI->insertBefore(WidenIVR);
2643 
2644   // Create the backedge value for the vector IV.
2645   VPValue *Inc;
2646   VPValue *Prev;
2647   // If unrolled, use the increment and prev value from the operands.
2648   if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
2649     Inc = SplatVF;
2650     Prev = WidenIVR->getLastUnrolledPartOperand();
2651   } else {
2652     if (VPRecipeBase *R = VF->getDefiningRecipe())
2653       Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
2654     // Multiply the vectorization factor by the step using integer or
2655     // floating-point arithmetic as appropriate.
2656     if (StepTy->isFloatingPointTy())
2657       VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
2658                                     DL);
2659     else
2660       VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
2661                                            TypeInfo.inferScalarType(VF), DL);
2662 
2663     Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
2664     Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
2665     Prev = WidePHI;
2666   }
2667 
2668   VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
2669   Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
2670   auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
2671                                     WidenIVR->getDebugLoc(), "vec.ind.next");
2672 
2673   WidePHI->addOperand(Next);
2674 
2675   WidenIVR->replaceAllUsesWith(WidePHI);
2676 }
2677 
2678 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
2679   // Replace loop regions with explicit CFG.
2680   SmallVector<VPRegionBlock *> LoopRegions;
2681   for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
2682            vp_depth_first_deep(Plan.getEntry()))) {
2683     if (!R->isReplicator())
2684       LoopRegions.push_back(R);
2685   }
2686   for (VPRegionBlock *R : LoopRegions)
2687     R->dissolveToCFGLoop();
2688 }
2689 
2690 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
2691                                                Type &CanonicalIVTy) {
2692   using namespace llvm::VPlanPatternMatch;
2693 
2694   VPTypeAnalysis TypeInfo(&CanonicalIVTy);
2695   SmallVector<VPRecipeBase *> ToRemove;
2696   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2697            vp_depth_first_deep(Plan.getEntry()))) {
2698     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2699       if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2700         auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
2701             {PhiR->getStartValue(), PhiR->getBackedgeValue()},
2702             PhiR->getDebugLoc(), "evl.based.iv");
2703         PhiR->replaceAllUsesWith(ScalarR);
2704         ToRemove.push_back(PhiR);
2705         continue;
2706       }
2707 
2708       if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
2709         expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
2710         ToRemove.push_back(WidenIVR);
2711         continue;
2712       }
2713 
2714       if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
2715         Expr->decompose();
2716         ToRemove.push_back(Expr);
2717       }
2718 
2719       VPValue *VectorStep;
2720       VPValue *ScalarStep;
2721       if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
2722                          m_VPValue(VectorStep), m_VPValue(ScalarStep))))
2723         continue;
2724 
2725       // Expand WideIVStep.
2726       auto *VPI = cast<VPInstruction>(&R);
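      // A sketch of the expansion for an integer IV (names illustrative):
      //   WIDE-IV-STEP(vp<%vstep>, ir<3>)  ==>  EMIT vp<%mul> = mul vp<%vstep>, ir<3>
      // with a cast of either operand inserted first when its type differs
      // from the IV type.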
2727       VPBuilder Builder(VPI);
2728       Type *IVTy = TypeInfo.inferScalarType(VPI);
2729       if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
2730         Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
2731                                           ? Instruction::UIToFP
2732                                           : Instruction::Trunc;
2733         VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
2734       }
2735 
2736       [[maybe_unused]] auto *ConstStep =
2737           ScalarStep->isLiveIn()
2738               ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
2739               : nullptr;
2740       assert(!ConstStep || ConstStep->getValue() != 1);
2742       if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
2743         ScalarStep =
2744             Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
2745       }
2746 
2747       VPIRFlags Flags;
2748       if (IVTy->isFloatingPointTy())
2749         Flags = {VPI->getFastMathFlags()};
2750 
2751       unsigned MulOpc =
2752           IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
2753       VPInstruction *Mul = Builder.createNaryOp(
2754           MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
2755       VectorStep = Mul;
2756       VPI->replaceAllUsesWith(VectorStep);
2757       ToRemove.push_back(VPI);
2758     }
2759   }
2760 
2761   for (VPRecipeBase *R : ToRemove)
2762     R->eraseFromParent();
2763 }
2764 
2765 void VPlanTransforms::handleUncountableEarlyExit(
2766     VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
2767     VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
2768   using namespace llvm::VPlanPatternMatch;
2769 
2770   VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
2771   if (!EarlyExitVPBB->getSinglePredecessor() &&
2772       EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
2773     assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
2774            EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
2775            "unsupported early exit VPBB");
2776     // Early exit operand should always be last phi operand. If EarlyExitVPBB
2777     // has two predecessors and EarlyExitingVPBB is the first, swap the operands
2778     // of the phis.
2779     for (VPRecipeBase &R : EarlyExitVPBB->phis())
2780       cast<VPIRPhi>(&R)->swapOperands();
2781   }
2782 
2783   VPBuilder Builder(LatchVPBB->getTerminator());
2784   VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
2785   assert(
2786       match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
2787       "Terminator must be BranchOnCond");
2788   VPValue *CondOfEarlyExitingVPBB =
2789       EarlyExitingVPBB->getTerminator()->getOperand(0);
2790   auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
2791                               ? CondOfEarlyExitingVPBB
2792                               : Builder.createNot(CondOfEarlyExitingVPBB);
2793 
2794   // Split the middle block and have it conditionally branch to the early exit
2795   // block if CondToEarlyExit.
2796   VPValue *IsEarlyExitTaken =
2797       Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
2798   VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
2799   VPBasicBlock *VectorEarlyExitVPBB =
2800       Plan.createVPBasicBlock("vector.early.exit");
2801   VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
2802   VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
2803   NewMiddle->swapSuccessors();
2804 
2805   VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
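  // The exit CFG now looks like this (sketch; blocks created above are marked
  // with *):
  //
  //   vector latch -> middle.split*
  //   middle.split* -> vector.early.exit*    (if any lane took the early exit)
  //   middle.split* -> original middle block (otherwise)
  //   vector.early.exit* -> early exit block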
2806 
2807   // Update the exit phis in the early exit block.
2808   VPBuilder MiddleBuilder(NewMiddle);
2809   VPBuilder EarlyExitB(VectorEarlyExitVPBB);
2810   for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
2811     auto *ExitIRI = cast<VPIRPhi>(&R);
2812     // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
2813     // a single predecessor and 1 if it has two.
2814     unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
2815     if (ExitIRI->getNumOperands() != 1) {
2816       // The first of two operands corresponds to the latch exit, via MiddleVPBB
2817       // predecessor. Extract its last lane.
2818       ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
2819     }
2820 
2821     VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
2822     auto IsVector = [](ElementCount VF) { return VF.isVector(); };
2823     // When the VFs are vectors, an `extract` is needed to get the incoming
2824     // value from the early exit. When the range contains a scalar VF, clamp
2825     // the range to the scalar VF to prevent mis-compiling a range that
2826     // contains both scalar and vector VFs.
2827     if (!IncomingFromEarlyExit->isLiveIn() &&
2828         LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) {
2829       // Update the incoming value from the early exit.
2830       VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
2831           VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
2832           "first.active.lane");
2833       IncomingFromEarlyExit = EarlyExitB.createNaryOp(
2834           Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
2835           nullptr, "early.exit.value");
2836       ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
2837     }
2838   }
2839   MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
2840 
2841   // Replace the condition controlling the non-early exit from the vector loop
2842   // with one exiting if either the original condition of the vector latch is
2843   // true or the early exit has been taken.
2844   auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
2845   assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
2846          "Unexpected terminator");
2847   auto *IsLatchExitTaken =
2848       Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
2849                          LatchExitingBranch->getOperand(1));
2850   auto *AnyExitTaken = Builder.createNaryOp(
2851       Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
2852   Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
2853   LatchExitingBranch->eraseFromParent();
2854 }
2855 
2856 /// This function tries to convert extended in-loop reductions to a
2857 /// VPExpressionRecipe and clamps the \p Range if that is beneficial and
2858 /// valid. The created recipe must be decomposed to its constituent
2859 /// recipes before execution.
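/// For example (types illustrative), reduce.add(zext <16 x i8> %a to
/// <16 x i32>) is folded into one abstract recipe when the target's
/// extended-reduction cost is lower than the cost of the separate extend plus
/// reduction.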
2860 static VPExpressionRecipe *
2861 tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
2862                                      VFRange &Range) {
2863   using namespace VPlanPatternMatch;
2864 
2865   Type *RedTy = Ctx.Types.inferScalarType(Red);
2866   VPValue *VecOp = Red->getVecOp();
2867 
2868   // Clamp the range if using extended-reduction is profitable.
2869   auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
2870                                              Type *SrcTy) -> bool {
2871     return LoopVectorizationPlanner::getDecisionAndClampRange(
2872         [&](ElementCount VF) {
2873           auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
2874           TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2875           InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
2876               Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
2877               CostKind);
2878           InstructionCost ExtCost =
2879               cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
2880           InstructionCost RedCost = Red->computeCost(VF, Ctx);
2881           return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
2882         },
2883         Range);
2884   };
2885 
2886   VPValue *A;
2887   // Match reduce(ext(A)).
2888   if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
2889       IsExtendedRedValidAndClampRange(
2890           RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
2891           cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
2892               Instruction::CastOps::ZExt,
2893           Ctx.Types.inferScalarType(A)))
2894     return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
2895 
2896   return nullptr;
2897 }
2898 
2899 /// This function tries to convert in-loop multiply-accumulate reductions to a
2900 /// VPExpressionRecipe and clamps the \p Range if that is beneficial
2901 /// and valid. The created VPExpressionRecipe must be decomposed to its
2902 /// constituent recipes before execution. Patterns of the
2903 /// VPExpressionRecipe:
2904 ///   reduce.add(mul(...)),
2905 ///   reduce.add(mul(ext(A), ext(B))),
2906 ///   reduce.add(ext(mul(ext(A), ext(B)))).
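/// For example (types illustrative), reduce.add(mul(zext(<16 x i8> %a),
/// zext(<16 x i8> %b))) can be costed as a single multiply-accumulate
/// reduction, which many targets support directly.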
2907 static VPExpressionRecipe *
2908 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
2909                                           VPCostContext &Ctx, VFRange &Range) {
2910   using namespace VPlanPatternMatch;
2911 
2912   unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
2913   if (Opcode != Instruction::Add)
2914     return nullptr;
2915 
2916   Type *RedTy = Ctx.Types.inferScalarType(Red);
2917 
2918   // Clamp the range if using multiply-accumulate-reduction is profitable.
2919   auto IsMulAccValidAndClampRange =
2920       [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
2921           VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
2922     return LoopVectorizationPlanner::getDecisionAndClampRange(
2923         [&](ElementCount VF) {
2924           TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2925           Type *SrcTy =
2926               Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
2927           auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
2928           InstructionCost MulAccCost =
2929               Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
2930           InstructionCost MulCost = Mul->computeCost(VF, Ctx);
2931           InstructionCost RedCost = Red->computeCost(VF, Ctx);
2932           InstructionCost ExtCost = 0;
2933           if (Ext0)
2934             ExtCost += Ext0->computeCost(VF, Ctx);
2935           if (Ext1)
2936             ExtCost += Ext1->computeCost(VF, Ctx);
2937           if (OuterExt)
2938             ExtCost += OuterExt->computeCost(VF, Ctx);
2939 
2940           return MulAccCost.isValid() &&
2941                  MulAccCost < ExtCost + MulCost + RedCost;
2942         },
2943         Range);
2944   };
2945 
2946   VPValue *VecOp = Red->getVecOp();
2947   VPValue *A, *B;
2948   // Try to match reduce.add(mul(...)).
2949   if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
2950     auto *RecipeA =
2951         dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
2952     auto *RecipeB =
2953         dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
2954     auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
2955 
2956     // Match reduce.add(mul(ext, ext)).
2957     if (RecipeA && RecipeB &&
2958         (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
2959         match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
2960         match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
2961         IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
2962                                        Instruction::CastOps::ZExt,
2963                                    Mul, RecipeA, RecipeB, nullptr)) {
2964       return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
2965     }
2966     // Match reduce.add(mul).
2967     if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
2968       return new VPExpressionRecipe(Mul, Red);
2969   }
2970   // Match reduce.add(ext(mul(ext(A), ext(B)))).
2971   // All extend recipes must have the same opcode, or A == B, in which case
2972   // it can be transformed to reduce.add(zext(mul(sext(A), sext(B)))).
2973   if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
2974                                       m_ZExtOrSExt(m_VPValue()))))) {
2975     auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
2976     auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
2977     auto *Ext0 =
2978         cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
2979     auto *Ext1 =
2980         cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
2981     if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
2982         Ext0->getOpcode() == Ext1->getOpcode() &&
2983         IsMulAccValidAndClampRange(Ext0->getOpcode() ==
2984                                        Instruction::CastOps::ZExt,
2985                                    Mul, Ext0, Ext1, Ext)) {
2986       auto *NewExt0 = new VPWidenCastRecipe(
2987           Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
2988           Ext0->getDebugLoc());
2989       NewExt0->insertBefore(Ext0);
2990 
2991       VPWidenCastRecipe *NewExt1 = NewExt0;
2992       if (Ext0 != Ext1) {
2993         NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
2994                                         Ext->getResultType(), *Ext1,
2995                                         Ext1->getDebugLoc());
2996         NewExt1->insertBefore(Ext1);
2997       }
2998       Mul->setOperand(0, NewExt0);
2999       Mul->setOperand(1, NewExt1);
3000       Red->setOperand(1, Mul);
3001       return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3002     }
3003   }
3004   return nullptr;
3005 }
3006 
3007 /// This function tries to create an abstract recipe from the reduction recipe
3008 /// for subsequent optimizations and cost estimation.
3009 static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3010                                                VPCostContext &Ctx,
3011                                                VFRange &Range) {
3012   VPExpressionRecipe *AbstractR = nullptr;
3013   auto IP = std::next(Red->getIterator());
3014   auto *VPBB = Red->getParent();
3015   if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3016     AbstractR = MulAcc;
3017   else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3018     AbstractR = ExtRed;
3019   // Could not create an abstract in-loop reduction recipe.
3020   if (!AbstractR)
3021     return;
3022 
3023   AbstractR->insertBefore(*VPBB, IP);
3024   Red->replaceAllUsesWith(AbstractR);
3025 }
3026 
3027 void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
3028                                                VFRange &Range) {
3029   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3030            vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
3031     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3032       if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
3033         tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
3034     }
3035   }
3036 }
3037 
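/// Materialize explicit broadcasts for live-ins and VPValues defined in the
/// entry block that have vector users. As a sketch (names illustrative), a
/// live-in ir<%x> with a widened user gets
///   EMIT vp<%broadcast> = broadcast ir<%x>
/// in the vector preheader, and its non-scalar users are rewired to it.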
3038 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3039   if (Plan.hasScalarVFOnly())
3040     return;
3041 
3042 #ifndef NDEBUG
3043   VPDominatorTree VPDT;
3044   VPDT.recalculate(Plan);
3045 #endif
3046 
3047   SmallVector<VPValue *> VPValues;
3048   if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0)
3049     VPValues.push_back(Plan.getOrCreateBackedgeTakenCount());
3050   append_range(VPValues, Plan.getLiveIns());
3051   for (VPRecipeBase &R : *Plan.getEntry())
3052     append_range(VPValues, R.definedValues());
3053 
3054   auto *VectorPreheader = Plan.getVectorPreheader();
3055   for (VPValue *VPV : VPValues) {
3056     if (all_of(VPV->users(),
3057                [VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
3058         (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3059          isa<Constant>(VPV->getLiveInIRValue())))
3060       continue;
3061 
3062     // Add explicit broadcast at the insert point that dominates all users.
3063     VPBasicBlock *HoistBlock = VectorPreheader;
3064     VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3065     for (VPUser *User : VPV->users()) {
3066       if (User->usesScalars(VPV))
3067         continue;
3068       if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3069         HoistPoint = HoistBlock->begin();
3070       else
3071         assert(VPDT.dominates(VectorPreheader,
3072                               cast<VPRecipeBase>(User)->getParent()) &&
3073                "All users must be in the vector preheader or dominated by it");
3074     }
3075 
3076     VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3077     auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3078     VPV->replaceUsesWithIf(Broadcast,
3079                            [VPV, Broadcast](VPUser &U, unsigned Idx) {
3080                              return Broadcast != &U && !U.usesScalars(VPV);
3081                            });
3082   }
3083 }
3084 
3085 /// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
3086 /// can be converted to a narrower recipe. \p OpV is used by a wide recipe that
3087 /// feeds a store interleave group at index \p Idx; \p WideMember0 is the
3088 /// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe
3089 /// can be narrowed to an index-independent load if it feeds all wide ops at
3090 /// all indices (\p OpV must be the operand at index \p OpIdx of the recipe at
3091 /// lane 0, \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide
3092 /// load if \p OpV is defined at \p Idx of a load interleave group.
3093 static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
3094                           VPValue *OpV, unsigned Idx) {
3095   auto *DefR = OpV->getDefiningRecipe();
3096   if (!DefR)
3097     return WideMember0->getOperand(OpIdx) == OpV;
3098   if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
3099     return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
3100 
3101   if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
3102     return IR->getInterleaveGroup()->getFactor() ==
3103                IR->getInterleaveGroup()->getNumMembers() &&
3104            IR->getVPValue(Idx) == OpV;
3105   return false;
3106 }
3107 
3108 /// Returns true if \p InterleaveR is a full interleave group with factor and
3109 /// number of members both equal to \p VF. The interleave group must also
3110 /// access the full vector width \p VectorRegWidth.
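/// For example, with \p VF = 4, a group of four i32 members spans
/// 4 * 32 = 128 bits and is considered consecutive iff \p VectorRegWidth is
/// 128.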
3111 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
3112                                          unsigned VF, VPTypeAnalysis &TypeInfo,
3113                                          unsigned VectorRegWidth) {
3114   if (!InterleaveR)
3115     return false;
3116 
3117   Type *GroupElementTy = nullptr;
3118   if (InterleaveR->getStoredValues().empty()) {
3119     GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
3120     if (!all_of(InterleaveR->definedValues(),
3121                 [&TypeInfo, GroupElementTy](VPValue *Op) {
3122                   return TypeInfo.inferScalarType(Op) == GroupElementTy;
3123                 }))
3124       return false;
3125   } else {
3126     GroupElementTy =
3127         TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
3128     if (!all_of(InterleaveR->getStoredValues(),
3129                 [&TypeInfo, GroupElementTy](VPValue *Op) {
3130                   return TypeInfo.inferScalarType(Op) == GroupElementTy;
3131                 }))
3132       return false;
3133   }
3134 
3135   unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
3136   auto IG = InterleaveR->getInterleaveGroup();
3137   return IG->getFactor() == VF && IG->getNumMembers() == VF &&
3138          GroupSize == VectorRegWidth;
3139 }
3140 
3141 /// Returns true if \p VPV is a narrow VPValue.
3142 static bool isAlreadyNarrow(VPValue *VPV) {
3143   if (VPV->isLiveIn())
3144     return true;
3145   auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
3146   return RepR && RepR->isSingleScalar();
3147 }
3148 
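/// Try to narrow interleave groups that cover a full vector register into wide
/// loads/stores processing a single original iteration. A sketch of the effect
/// (types illustrative): a factor-4 i32 load group whose members feed the
/// matching lanes of a factor-4 store group becomes one wide <4 x i32> load
/// and one wide store, and the canonical IV step becomes UF instead of
/// VF * UF.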
3149 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3150                                              unsigned VectorRegWidth) {
3151   using namespace llvm::VPlanPatternMatch;
3152   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3153   if (VF.isScalable() || !VectorLoop)
3154     return;
3155 
3156   VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
3157   Type *CanonicalIVType = CanonicalIV->getScalarType();
3158   VPTypeAnalysis TypeInfo(CanonicalIVType);
3159 
3160   unsigned FixedVF = VF.getFixedValue();
3161   SmallVector<VPInterleaveRecipe *> StoreGroups;
3162   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
3163     if (isa<VPCanonicalIVPHIRecipe>(&R) ||
3164         match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
3165       continue;
3166 
3167     if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
3168         vputils::onlyFirstLaneUsed(cast<VPSingleDefRecipe>(&R)))
3169       continue;
3170 
3171     // Bail out on recipes not supported at the moment:
3172     //  * phi recipes other than the canonical induction
3173     //  * recipes writing to memory except interleave groups
3174     // Only support plans with a canonical induction phi.
3175     if (R.isPhi())
3176       return;
3177 
3178     auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
3179     if (R.mayWriteToMemory() && !InterleaveR)
3180       return;
3181 
3182     // Do not narrow interleave groups if there are VectorPointer recipes and
3183     // the plan was unrolled. The recipe implicitly uses VF from
3184     // VPTransformState.
3185     // TODO: Remove restriction once the VF for the VectorPointer offset is
3186     // modeled explicitly as operand.
3187     if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
3188       return;
3189 
3190     // All other ops are allowed, but we reject uses that cannot be converted
3191     // when checking all allowed consumers (store interleave groups) below.
3192     if (!InterleaveR)
3193       continue;
3194 
3195     // Bail out on non-consecutive interleave groups.
3196     if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
3197                                       VectorRegWidth))
3198       return;
3199 
3200     // Skip read interleave groups.
3201     if (InterleaveR->getStoredValues().empty())
3202       continue;
3203 
3204     // Narrow interleave groups, if all operands are already matching narrow
3205     // ops.
3206     auto *Member0 = InterleaveR->getStoredValues()[0];
3207     if (isAlreadyNarrow(Member0) &&
3208         all_of(InterleaveR->getStoredValues(),
3209                [Member0](VPValue *VPV) { return Member0 == VPV; })) {
3210       StoreGroups.push_back(InterleaveR);
3211       continue;
3212     }
3213 
3214     // For now, we only support full interleave groups storing load interleave
3215     // groups.
3216     if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
3217           VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
3218           if (!DefR)
3219             return false;
3220           auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
3221           return IR &&
3222                  IR->getInterleaveGroup()->getFactor() ==
3223                      IR->getInterleaveGroup()->getNumMembers() &&
3224                  IR->getVPValue(Op.index()) == Op.value();
3225         })) {
3226       StoreGroups.push_back(InterleaveR);
3227       continue;
3228     }
3229 
3230     // Check if all values feeding InterleaveR are matching wide recipes whose
3231     // operands can be narrowed.
3232     auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
3233         InterleaveR->getStoredValues()[0]->getDefiningRecipe());
3234     if (!WideMember0)
3235       return;
3236     for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
3237       auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
3238       if (!R || R->getOpcode() != WideMember0->getOpcode() ||
3239           R->getNumOperands() > 2)
3240         return;
3241       if (any_of(enumerate(R->operands()),
3242                  [WideMember0, Idx = I](const auto &P) {
3243                    const auto &[OpIdx, OpV] = P;
3244                    return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
3245                  }))
3246         return;
3247     }
3248     StoreGroups.push_back(InterleaveR);
3249   }
3250 
3251   if (StoreGroups.empty())
3252     return;
3253 
3254   // Narrow the recipe defining \p V, e.g. a load interleave group, to a single narrower recipe.
3255   SmallPtrSet<VPValue *, 4> NarrowedOps;
3256   auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
3257     auto *R = V->getDefiningRecipe();
3258     if (!R || NarrowedOps.contains(V))
3259       return V;
3260     if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
3261       // Narrow interleave group to wide load, as transformed VPlan will only
3262       // process one original iteration.
3263       auto *L = new VPWidenLoadRecipe(
3264           *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
3265           LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
3266           /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
3267       L->insertBefore(LoadGroup);
3268       NarrowedOps.insert(L);
3269       return L;
3270     }
3271 
3272     if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
3273       assert(RepR->isSingleScalar() &&
3274              isa<LoadInst>(RepR->getUnderlyingInstr()) &&
3275              "must be a single scalar load");
3276       NarrowedOps.insert(RepR);
3277       return RepR;
3278     }
3279     auto *WideLoad = cast<VPWidenLoadRecipe>(R);
3280 
3281     // Narrow wide load to uniform scalar load, as transformed VPlan will only
3282     // process one original iteration.
3283     auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
3284                                     WideLoad->operands(), /*IsUniform*/ true,
3285                                     /*Mask*/ nullptr, *WideLoad);
3286     N->insertBefore(WideLoad);
3287     NarrowedOps.insert(N);
3288     return N;
3289   };
3290 
3291   // Narrow operation tree rooted at store groups.
3292   for (auto *StoreGroup : StoreGroups) {
3293     VPValue *Res = nullptr;
3294     VPValue *Member0 = StoreGroup->getStoredValues()[0];
3295     if (isAlreadyNarrow(Member0)) {
3296       Res = Member0;
3297     } else if (auto *WideMember0 =
3298                    dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) {
3299       for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
3300         WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
3301       Res = WideMember0;
3302     } else {
3303       Res = NarrowOp(Member0);
3304     }
3305 
3306     auto *S = new VPWidenStoreRecipe(
3307         *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
3308         StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
3309         /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
3310     S->insertBefore(StoreGroup);
3311     StoreGroup->eraseFromParent();
3312   }
3313 
3314   // Adjust induction to reflect that the transformed plan only processes one
3315   // original iteration.
3316   auto *CanIV = Plan.getCanonicalIV();
3317   auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
3318   Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
3319                          CanIV->getScalarType(), 1 * Plan.getUF())));
3320   Plan.getVF().replaceAllUsesWith(
3321       Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
3322   removeDeadRecipes(Plan);
3323 }
3324 
3325 /// Add branch weight metadata, if the \p Plan's middle block is terminated by a
3326 /// BranchOnCond recipe.
3327 void VPlanTransforms::addBranchWeightToMiddleTerminator(
3328     VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
3329   VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
3330   auto *MiddleTerm =
3331       dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
3332   // Only add branch metadata if there is a (conditional) terminator.
3333   if (!MiddleTerm)
3334     return;
3335 
3336   assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
3337          "must have a BranchOnCond");
3338   // Assume that `TripCount % VectorStep` is equally distributed.
3339   unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
3340   if (VF.isScalable() && VScaleForTuning.has_value())
3341     VectorStep *= *VScaleForTuning;
3342   assert(VectorStep > 0 && "trip count should not be zero");
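  // For example, VF = 4 and UF = 2 give VectorStep = 8: under the assumption
  // above the remainder is zero one time in eight, so the terminator gets
  // branch weights {1, 7}.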
3343   MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
3344   MDNode *BranchWeights =
3345       MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
3346   MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
3347 }
3348