1 //===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements a set of utility VPlan to VPlan transformations.
11 ///
12 //===----------------------------------------------------------------------===//
13
14 #include "VPlanTransforms.h"
15 #include "VPRecipeBuilder.h"
16 #include "VPlan.h"
17 #include "VPlanAnalysis.h"
18 #include "VPlanCFG.h"
19 #include "VPlanDominatorTree.h"
20 #include "VPlanHelpers.h"
21 #include "VPlanPatternMatch.h"
22 #include "VPlanUtils.h"
23 #include "VPlanVerifier.h"
24 #include "llvm/ADT/APInt.h"
25 #include "llvm/ADT/PostOrderIterator.h"
26 #include "llvm/ADT/STLExtras.h"
27 #include "llvm/ADT/SetVector.h"
28 #include "llvm/ADT/TypeSwitch.h"
29 #include "llvm/Analysis/IVDescriptors.h"
30 #include "llvm/Analysis/InstSimplifyFolder.h"
31 #include "llvm/Analysis/LoopInfo.h"
32 #include "llvm/Analysis/VectorUtils.h"
33 #include "llvm/IR/Intrinsics.h"
34 #include "llvm/IR/MDBuilder.h"
35 #include "llvm/IR/PatternMatch.h"
36 #include "llvm/Support/Casting.h"
37 #include "llvm/Support/TypeSize.h"
38
39 using namespace llvm;
40
41 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
42 VPlanPtr &Plan,
43 function_ref<const InductionDescriptor *(PHINode *)>
44 GetIntOrFpInductionDescriptor,
45 ScalarEvolution &SE, const TargetLibraryInfo &TLI) {
46
47 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
48 Plan->getVectorLoopRegion());
49 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
50 // Skip blocks outside the region.
51 if (!VPBB->getParent())
52 break;
53 VPRecipeBase *Term = VPBB->getTerminator();
54 auto EndIter = Term ? Term->getIterator() : VPBB->end();
55 // Introduce each ingredient into VPlan.
56 for (VPRecipeBase &Ingredient :
57 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
58
59 VPValue *VPV = Ingredient.getVPSingleValue();
60 if (!VPV->getUnderlyingValue())
61 continue;
62
63 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
64
65 VPRecipeBase *NewRecipe = nullptr;
66 if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
67 auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
68 const auto *II = GetIntOrFpInductionDescriptor(Phi);
69 if (!II)
70 continue;
71
72 VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
73 VPValue *Step =
74 vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
75 NewRecipe = new VPWidenIntOrFpInductionRecipe(
76 Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
77 } else {
78 assert(isa<VPInstruction>(&Ingredient) &&
79 "only VPInstructions expected here");
80 assert(!isa<PHINode>(Inst) && "phis should be handled above");
81 // Create VPWidenMemoryRecipe for loads and stores.
82 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
83 NewRecipe = new VPWidenLoadRecipe(
84 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
85 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
86 Ingredient.getDebugLoc());
87 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
88 NewRecipe = new VPWidenStoreRecipe(
89 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
90 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
91 VPIRMetadata(*Store), Ingredient.getDebugLoc());
92 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
93 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
94 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
95 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
96 if (VectorID == Intrinsic::not_intrinsic)
97 return false;
98 NewRecipe = new VPWidenIntrinsicRecipe(
99 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
100 {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
101 CI->getDebugLoc());
102 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
103 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
104 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
105 NewRecipe = new VPWidenCastRecipe(
106 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
107 } else {
108 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
109 }
110 }
111
112 NewRecipe->insertBefore(&Ingredient);
113 if (NewRecipe->getNumDefinedValues() == 1)
114 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
115 else
116 assert(NewRecipe->getNumDefinedValues() == 0 &&
117 "Only recpies with zero or one defined values expected");
118 Ingredient.eraseFromParent();
119 }
120 }
121 return true;
122 }
123
124 static bool sinkScalarOperands(VPlan &Plan) {
125 auto Iter = vp_depth_first_deep(Plan.getEntry());
126 bool Changed = false;
127 // First, collect the operands of all recipes in replicate blocks as seeds for
128 // sinking.
129 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
130 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
131 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
132 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
133 continue;
134 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
135 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
136 continue;
137 for (auto &Recipe : *VPBB) {
138 for (VPValue *Op : Recipe.operands())
139 if (auto *Def =
140 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
141 WorkList.insert(std::make_pair(VPBB, Def));
142 }
143 }
144
145 bool ScalarVFOnly = Plan.hasScalarVFOnly();
146 // Try to sink each replicate or scalar IV steps recipe in the worklist.
147 for (unsigned I = 0; I != WorkList.size(); ++I) {
148 VPBasicBlock *SinkTo;
149 VPSingleDefRecipe *SinkCandidate;
150 std::tie(SinkTo, SinkCandidate) = WorkList[I];
151 if (SinkCandidate->getParent() == SinkTo ||
152 SinkCandidate->mayHaveSideEffects() ||
153 SinkCandidate->mayReadOrWriteMemory())
154 continue;
155 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
156 if (!ScalarVFOnly && RepR->isSingleScalar())
157 continue;
158 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
159 continue;
160
161 bool NeedsDuplicating = false;
162 // All recipe users of the sink candidate must be in the same block SinkTo
163 // or all users outside of SinkTo must be uniform-after-vectorization
164 // (i.e., only the first lane is used). In the latter case, we need to duplicate
165 // SinkCandidate.
166 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
167 SinkCandidate](VPUser *U) {
168 auto *UI = cast<VPRecipeBase>(U);
169 if (UI->getParent() == SinkTo)
170 return true;
171 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
172 // We only know how to duplicate VPReplicateRecipes and
173 // VPScalarIVStepsRecipes for now.
174 return NeedsDuplicating &&
175 isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
176 };
177 if (!all_of(SinkCandidate->users(), CanSinkWithUser))
178 continue;
179
180 if (NeedsDuplicating) {
181 if (ScalarVFOnly)
182 continue;
183 VPSingleDefRecipe *Clone;
184 if (auto *SinkCandidateRepR =
185 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
186 // TODO: Handle converting to uniform recipes as separate transform,
187 // then cloning should be sufficient here.
188 Instruction *I = SinkCandidate->getUnderlyingInstr();
189 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
190 nullptr /*Mask*/, *SinkCandidateRepR);
191 // TODO: add ".cloned" suffix to name of Clone's VPValue.
192 } else {
193 Clone = SinkCandidate->clone();
194 }
195
196 Clone->insertBefore(SinkCandidate);
197 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
198 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
199 });
200 }
201 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
202 for (VPValue *Op : SinkCandidate->operands())
203 if (auto *Def =
204 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
205 WorkList.insert(std::make_pair(SinkTo, Def));
206 Changed = true;
207 }
208 return Changed;
209 }
210
211 /// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
212 /// the mask.
213 VPValue *getPredicatedMask(VPRegionBlock *R) {
214 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
215 if (!EntryBB || EntryBB->size() != 1 ||
216 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
217 return nullptr;
218
219 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
220 }
221
222 /// If \p R is a triangle region, return the 'then' block of the triangle.
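/// Conceptually (illustrative sketch only; the block names follow the
/// ".entry"/".if"/".continue" naming used by createReplicateRegion below):
///
///    pred.store.entry      BranchOnMask %mask
///      |         \
///      |      pred.store.if        <- the 'then' block
///      |         /
///    pred.store.continue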
223 static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
224 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
225 if (EntryBB->getNumSuccessors() != 2)
226 return nullptr;
227
228 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
229 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
230 if (!Succ0 || !Succ1)
231 return nullptr;
232
233 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
234 return nullptr;
235 if (Succ0->getSingleSuccessor() == Succ1)
236 return Succ0;
237 if (Succ1->getSingleSuccessor() == Succ0)
238 return Succ1;
239 return nullptr;
240 }
241
242 // Merge replicate regions into their successor regions, if a replicate region
243 // is connected to a successor replicate region with the same predicate by a
244 // single, empty VPBasicBlock.
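// For example (illustrative sketch): given
//   (region R1, mask %m) -> empty VPBB -> (region R2, mask %m)
// the recipes from R1's 'then' block are moved into R2's 'then' block and R1
// is disconnected, leaving a single replicate region guarded by %m.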
245 static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
246 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
247
248 // Collect replicate regions that are followed by an empty block, followed by
249 // another replicate region with a matching mask, into a worklist to process
250 // up front. This avoids iterator invalidation issues while merging regions.
251 SmallVector<VPRegionBlock *, 8> WorkList;
252 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
253 vp_depth_first_deep(Plan.getEntry()))) {
254 if (!Region1->isReplicator())
255 continue;
256 auto *MiddleBasicBlock =
257 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
258 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
259 continue;
260
261 auto *Region2 =
262 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
263 if (!Region2 || !Region2->isReplicator())
264 continue;
265
266 VPValue *Mask1 = getPredicatedMask(Region1);
267 VPValue *Mask2 = getPredicatedMask(Region2);
268 if (!Mask1 || Mask1 != Mask2)
269 continue;
270
271 assert(Mask1 && Mask2 && "both region must have conditions");
272 WorkList.push_back(Region1);
273 }
274
275 // Move recipes from Region1 to its successor region, if both are triangles.
276 for (VPRegionBlock *Region1 : WorkList) {
277 if (TransformedRegions.contains(Region1))
278 continue;
279 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
280 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
281
282 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
283 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
284 if (!Then1 || !Then2)
285 continue;
286
287 // Note: No fusion-preventing memory dependencies are expected in either
288 // region. Such dependencies should be rejected during earlier dependence
289 // checks, which guarantee accesses can be re-ordered for vectorization.
290 //
291 // Move recipes to the successor region.
292 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
293 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
294
295 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
296 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
297
298 // Move VPPredInstPHIRecipes from the merge block to the successor region's
299 // merge block. Update all users inside the successor region to use the
300 // original values.
301 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
302 VPValue *PredInst1 =
303 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
304 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
305 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
306 return cast<VPRecipeBase>(&U)->getParent() == Then2;
307 });
308
309 // Remove phi recipes that are unused after merging the regions.
310 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
311 Phi1ToMove.eraseFromParent();
312 continue;
313 }
314 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
315 }
316
317 // Remove the dead recipes in Region1's entry block.
318 for (VPRecipeBase &R :
319 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
320 R.eraseFromParent();
321
322 // Finally, remove the first region.
323 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
324 VPBlockUtils::disconnectBlocks(Pred, Region1);
325 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
326 }
327 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
328 TransformedRegions.insert(Region1);
329 }
330
331 return !TransformedRegions.empty();
332 }
333
334 static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
335 VPlan &Plan) {
336 Instruction *Instr = PredRecipe->getUnderlyingInstr();
337 // Build the triangular if-then region.
338 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
339 assert(Instr->getParent() && "Predicated instruction not in any basic block");
340 auto *BlockInMask = PredRecipe->getMask();
341 auto *MaskDef = BlockInMask->getDefiningRecipe();
342 auto *BOMRecipe = new VPBranchOnMaskRecipe(
343 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
344 auto *Entry =
345 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
346
347 // Replace predicated replicate recipe with a replicate recipe without a
348 // mask but in the replicate region.
349 auto *RecipeWithoutMask = new VPReplicateRecipe(
350 PredRecipe->getUnderlyingInstr(),
351 make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
352 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
353 auto *Pred =
354 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
355
356 VPPredInstPHIRecipe *PHIRecipe = nullptr;
357 if (PredRecipe->getNumUsers() != 0) {
358 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
359 RecipeWithoutMask->getDebugLoc());
360 PredRecipe->replaceAllUsesWith(PHIRecipe);
361 PHIRecipe->setOperand(0, RecipeWithoutMask);
362 }
363 PredRecipe->eraseFromParent();
364 auto *Exiting =
365 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
366 VPRegionBlock *Region =
367 Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
368
369 // Note: first set Entry as region entry and then connect successors starting
370 // from it in order, to propagate the "parent" of each VPBasicBlock.
371 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
372 VPBlockUtils::connectBlocks(Pred, Exiting);
373
374 return Region;
375 }
376
377 static void addReplicateRegions(VPlan &Plan) {
378 SmallVector<VPReplicateRecipe *> WorkList;
379 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
380 vp_depth_first_deep(Plan.getEntry()))) {
381 for (VPRecipeBase &R : *VPBB)
382 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
383 if (RepR->isPredicated())
384 WorkList.push_back(RepR);
385 }
386 }
387
388 unsigned BBNum = 0;
389 for (VPReplicateRecipe *RepR : WorkList) {
390 VPBasicBlock *CurrentBlock = RepR->getParent();
391 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
392
393 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
394 SplitBlock->setName(
395 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
396 // Record predicated instructions for above packing optimizations.
397 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
398 Region->setParent(CurrentBlock->getParent());
399 VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
400
401 VPRegionBlock *ParentRegion = Region->getParent();
402 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
403 ParentRegion->setExiting(SplitBlock);
404 }
405 }
406
407 /// Remove redundant VPBasicBlocks by merging them into their predecessor if
408 /// the predecessor has a single successor.
409 static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
410 SmallVector<VPBasicBlock *> WorkList;
411 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
412 vp_depth_first_deep(Plan.getEntry()))) {
413 // Don't fold the blocks in the skeleton of the Plan into their single
414 // predecessors for now.
415 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
416 if (!VPBB->getParent())
417 continue;
418 auto *PredVPBB =
419 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
420 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
421 isa<VPIRBasicBlock>(PredVPBB))
422 continue;
423 WorkList.push_back(VPBB);
424 }
425
426 for (VPBasicBlock *VPBB : WorkList) {
427 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
428 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
429 R.moveBefore(*PredVPBB, PredVPBB->end());
430 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
431 auto *ParentRegion = VPBB->getParent();
432 if (ParentRegion && ParentRegion->getExiting() == VPBB)
433 ParentRegion->setExiting(PredVPBB);
434 for (auto *Succ : to_vector(VPBB->successors())) {
435 VPBlockUtils::disconnectBlocks(VPBB, Succ);
436 VPBlockUtils::connectBlocks(PredVPBB, Succ);
437 }
438 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
439 }
440 return !WorkList.empty();
441 }
442
443 void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
444 // Convert masked VPReplicateRecipes to if-then region blocks.
445 addReplicateRegions(Plan);
446
447 bool ShouldSimplify = true;
448 while (ShouldSimplify) {
449 ShouldSimplify = sinkScalarOperands(Plan);
450 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
451 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
452 }
453 }
454
455 /// Remove redundant casts of inductions.
456 ///
457 /// Such redundant casts are casts of induction variables that can be ignored,
458 /// because we already proved that the casted phi is equal to the uncasted phi
459 /// in the vectorized loop. There is no need to vectorize the cast - the same
460 /// value can be used for both the phi and casts in the vector loop.
461 static void removeRedundantInductionCasts(VPlan &Plan) {
462 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
463 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
464 if (!IV || IV->getTruncInst())
465 continue;
466
467 // A sequence of IR Casts has potentially been recorded for IV, which
468 // *must be bypassed* when the IV is vectorized, because the vectorized IV
469 // will produce the desired casted value. This sequence forms a def-use
470 // chain and is provided in reverse order, ending with the cast that uses
471 // the IV phi. Search for the recipe of the last cast in the chain and
472 // replace it with the original IV. Note that only the final cast is
473 // expected to have users outside the cast-chain and the dead casts left
474 // over will be cleaned up later.
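// Illustrative example (recipe names assumed, not taken from the source):
//   WIDEN-INDUCTION %iv = phi %start, %step   ; the optimized IV
//   %cast = trunc %iv                         ; last cast in the recorded chain
// The loop below locates the recipe for %cast and redirects its users to
// %iv; the now-dead cast recipes are removed by later cleanup.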
475 auto &Casts = IV->getInductionDescriptor().getCastInsts();
476 VPValue *FindMyCast = IV;
477 for (Instruction *IRCast : reverse(Casts)) {
478 VPSingleDefRecipe *FoundUserCast = nullptr;
479 for (auto *U : FindMyCast->users()) {
480 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
481 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
482 FoundUserCast = UserCast;
483 break;
484 }
485 }
486 FindMyCast = FoundUserCast;
487 }
488 FindMyCast->replaceAllUsesWith(IV);
489 }
490 }
491
492 /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
493 /// recipe, if it exists.
494 static void removeRedundantCanonicalIVs(VPlan &Plan) {
495 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
496 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
497 for (VPUser *U : CanonicalIV->users()) {
498 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
499 if (WidenNewIV)
500 break;
501 }
502
503 if (!WidenNewIV)
504 return;
505
506 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
507 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
508 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
509
510 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
511 continue;
512
513 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
514 // everything WidenNewIV's users need. That is, WidenOriginalIV will
515 // generate a vector phi or all users of WidenNewIV demand the first lane
516 // only.
517 if (any_of(WidenOriginalIV->users(),
518 [WidenOriginalIV](VPUser *U) {
519 return !U->usesScalars(WidenOriginalIV);
520 }) ||
521 vputils::onlyFirstLaneUsed(WidenNewIV)) {
522 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
523 WidenNewIV->eraseFromParent();
524 return;
525 }
526 }
527 }
528
529 /// Returns true if \p R is dead and can be removed.
530 static bool isDeadRecipe(VPRecipeBase &R) {
531 using namespace llvm::PatternMatch;
532 // Do remove conditional assume instructions as their conditions may be
533 // flattened.
534 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
535 bool IsConditionalAssume =
536 RepR && RepR->isPredicated() &&
537 match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
538 if (IsConditionalAssume)
539 return true;
540
541 if (R.mayHaveSideEffects())
542 return false;
543
544 // Recipe is dead if no user keeps the recipe alive.
545 return all_of(R.definedValues(),
546 [](VPValue *V) { return V->getNumUsers() == 0; });
547 }
548
549 void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
550 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
551 Plan.getEntry());
552
553 for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
554 // The recipes in the block are processed in reverse order, to catch chains
555 // of dead recipes.
556 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
557 if (isDeadRecipe(R))
558 R.eraseFromParent();
559 }
560 }
561 }
562
563 static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan & Plan,InductionDescriptor::InductionKind Kind,Instruction::BinaryOps InductionOpcode,FPMathOperator * FPBinOp,Instruction * TruncI,VPValue * StartV,VPValue * Step,DebugLoc DL,VPBuilder & Builder)564 createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
565 Instruction::BinaryOps InductionOpcode,
566 FPMathOperator *FPBinOp, Instruction *TruncI,
567 VPValue *StartV, VPValue *Step, DebugLoc DL,
568 VPBuilder &Builder) {
569 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
570 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
571 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
572 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
573
574 // Truncate base induction if needed.
575 Type *CanonicalIVType = CanonicalIV->getScalarType();
576 VPTypeAnalysis TypeInfo(CanonicalIVType);
577 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
578 if (TruncI) {
579 Type *TruncTy = TruncI->getType();
580 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
581 "Not truncating.");
582 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
583 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
584 ResultTy = TruncTy;
585 }
586
587 // Truncate step if needed.
588 Type *StepTy = TypeInfo.inferScalarType(Step);
589 if (ResultTy != StepTy) {
590 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
591 "Not truncating.");
592 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
593 auto *VecPreheader =
594 cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
595 VPBuilder::InsertPointGuard Guard(Builder);
596 Builder.setInsertPoint(VecPreheader);
597 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
598 }
599 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
600 &Plan.getVF(), DL);
601 }
602
603 static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
604 SetVector<VPUser *> Users(llvm::from_range, V->users());
605 for (unsigned I = 0; I != Users.size(); ++I) {
606 VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
607 if (isa<VPHeaderPHIRecipe>(Cur))
608 continue;
609 for (VPValue *V : Cur->definedValues())
610 Users.insert_range(V->users());
611 }
612 return Users.takeVector();
613 }
614
615 /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
616 /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
617 /// VPWidenPointerInductionRecipe will generate vectors only. If some users
618 /// require vectors while others require scalars, the scalar uses need to extract
619 /// the scalars from the generated vectors (Note that this is different to how
620 /// int/fp inductions are handled). Legalize extract-from-ends using uniform
621 /// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
622 /// the correct end value is available. Also optimize
623 /// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
624 /// providing them scalar steps built on the canonical scalar IV and update the
625 /// original IV's users. This is an optional optimization to reduce the needs of
626 /// vector extracts.
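/// For the pointer-induction case the replacement is, conceptually
/// (illustrative sketch, value names assumed):
///   %steps    = SCALAR-IV-STEPS 0, %step
///   %next.gep = PTR-ADD %ptr.start, %steps
/// so that only scalar pointer values are generated.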
627 static void legalizeAndOptimizeInductions(VPlan &Plan) {
628 using namespace llvm::VPlanPatternMatch;
629 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
630 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
631 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
632 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
633 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
634 if (!PhiR)
635 continue;
636
637 // Try to narrow wide and replicating recipes to uniform recipes, based on
638 // VPlan analysis.
639 // TODO: Apply to all recipes in the future, to replace legacy uniformity
640 // analysis.
641 auto Users = collectUsersRecursively(PhiR);
642 for (VPUser *U : reverse(Users)) {
643 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
644 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
645 // Skip recipes that shouldn't be narrowed.
646 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
647 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
648 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
649 continue;
650
651 // Skip recipes that may have lanes other than their first one used.
652 if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
653 continue;
654
655 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
656 Def->operands(), /*IsUniform*/ true);
657 Clone->insertAfter(Def);
658 Def->replaceAllUsesWith(Clone);
659 }
660
661 // Replace wide pointer inductions which have only their scalars used by
662 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
663 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
664 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
665 continue;
666
667 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
668 VPValue *StartV =
669 Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
670 VPValue *StepV = PtrIV->getOperand(1);
671 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
672 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
673 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
674
675 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
676 PtrIV->getDebugLoc(), "next.gep");
677
678 PtrIV->replaceAllUsesWith(PtrAdd);
679 continue;
680 }
681
682 // Replace widened induction with scalar steps for users that only use
683 // scalars.
684 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
685 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
686 return U->usesScalars(WideIV);
687 }))
688 continue;
689
690 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
691 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
692 Plan, ID.getKind(), ID.getInductionOpcode(),
693 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
694 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
695 WideIV->getDebugLoc(), Builder);
696
697 // Update scalar users of IV to use Step instead.
698 if (!HasOnlyVectorVFs)
699 WideIV->replaceAllUsesWith(Steps);
700 else
701 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
702 return U.usesScalars(WideIV);
703 });
704 }
705 }
706
707 /// Check if \p VPV is an untruncated wide induction, either before or after the
708 /// increment. If so, return the header IV (before the increment); otherwise
709 /// return null.
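/// For example (illustrative, names assumed), given
///   WIDEN-INDUCTION %iv = phi %start, %step
///   %iv.next = add %iv, %step
/// both %iv and %iv.next are recognized, and %iv is returned in either case.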
710 static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
711 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
712 if (WideIV) {
713 // VPV itself is a wide induction, separately compute the end value for exit
714 // users if it is not a truncated IV.
715 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
716 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
717 }
718
719 // Check if VPV is an optimizable induction increment.
720 VPRecipeBase *Def = VPV->getDefiningRecipe();
721 if (!Def || Def->getNumOperands() != 2)
722 return nullptr;
723 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
724 if (!WideIV)
725 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
726 if (!WideIV)
727 return nullptr;
728
729 auto IsWideIVInc = [&]() {
730 using namespace VPlanPatternMatch;
731 auto &ID = WideIV->getInductionDescriptor();
732
733 // Check if VPV increments the induction by the induction step.
734 VPValue *IVStep = WideIV->getStepValue();
735 switch (ID.getInductionOpcode()) {
736 case Instruction::Add:
737 return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
738 m_Specific(IVStep)));
739 case Instruction::FAdd:
740 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
741 m_Specific(IVStep)));
742 case Instruction::FSub:
743 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
744 m_Specific(IVStep)));
745 case Instruction::Sub: {
746 // IVStep will be the negated step of the subtraction. Check if Step == -1
747 // * IVStep.
748 VPValue *Step;
749 if (!match(VPV,
750 m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
751 !Step->isLiveIn() || !IVStep->isLiveIn())
752 return false;
753 auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
754 auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
755 return StepCI && IVStepCI &&
756 StepCI->getValue() == (-1 * IVStepCI->getValue());
757 }
758 default:
759 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
760 match(VPV, m_GetElementPtr(m_Specific(WideIV),
761 m_Specific(WideIV->getStepValue())));
762 }
763 llvm_unreachable("should have been covered by switch above");
764 };
765 return IsWideIVInc() ? WideIV : nullptr;
766 }
767
768 /// Attempts to optimize the induction variable exit values for users in the
769 /// early exit block.
770 static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
771 VPTypeAnalysis &TypeInfo,
772 VPBlockBase *PredVPBB,
773 VPValue *Op) {
774 using namespace VPlanPatternMatch;
775
776 VPValue *Incoming, *Mask;
777 if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
778 m_VPValue(Incoming),
779 m_VPInstruction<VPInstruction::FirstActiveLane>(
780 m_VPValue(Mask)))))
781 return nullptr;
782
783 auto *WideIV = getOptimizableIVOf(Incoming);
784 if (!WideIV)
785 return nullptr;
786
787 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
788 if (WideIntOrFp && WideIntOrFp->getTruncInst())
789 return nullptr;
790
791 // Calculate the final index.
792 VPValue *EndValue = Plan.getCanonicalIV();
793 auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
794 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
795
796 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
797 VPValue *FirstActiveLane =
798 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
799 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
800 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
801 FirstActiveLaneType, DL);
802 EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
803
804 // `getOptimizableIVOf()` always returns the pre-incremented IV. If Incoming
805 // differs from it, the exit is using the incremented value, so we need to
806 // add the step.
807 if (Incoming != WideIV) {
808 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
809 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
810 }
811
812 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
813 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
814 VPValue *Start = WideIV->getStartValue();
815 VPValue *Step = WideIV->getStepValue();
816 EndValue = B.createDerivedIV(
817 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
818 Start, EndValue, Step);
819 }
820
821 return EndValue;
822 }
823
824 /// Attempts to optimize the induction variable exit values for users in the
825 /// exit block coming from the latch in the original scalar loop.
826 static VPValue *
optimizeLatchExitInductionUser(VPlan & Plan,VPTypeAnalysis & TypeInfo,VPBlockBase * PredVPBB,VPValue * Op,DenseMap<VPValue *,VPValue * > & EndValues)827 optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
828 VPBlockBase *PredVPBB, VPValue *Op,
829 DenseMap<VPValue *, VPValue *> &EndValues) {
830 using namespace VPlanPatternMatch;
831
832 VPValue *Incoming;
833 if (!match(Op, m_VPInstruction<VPInstruction::ExtractLastElement>(
834 m_VPValue(Incoming))))
835 return nullptr;
836
837 auto *WideIV = getOptimizableIVOf(Incoming);
838 if (!WideIV)
839 return nullptr;
840
841 VPValue *EndValue = EndValues.lookup(WideIV);
842 assert(EndValue && "end value must have been pre-computed");
843
844 // `getOptimizableIVOf()` always returns the pre-incremented IV. If Incoming
845 // differs from it, the exit is using the incremented value, so we don't
846 // need to subtract the step.
847 if (Incoming != WideIV)
848 return EndValue;
849
850 // Otherwise, subtract the step from the EndValue.
851 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
852 VPValue *Step = WideIV->getStepValue();
853 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
854 if (ScalarTy->isIntegerTy())
855 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
856 if (ScalarTy->isPointerTy()) {
857 auto *Zero = Plan.getOrAddLiveIn(
858 ConstantInt::get(Step->getLiveInIRValue()->getType(), 0));
859 return B.createPtrAdd(EndValue,
860 B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
861 "ind.escape");
862 }
863 if (ScalarTy->isFloatingPointTy()) {
864 const auto &ID = WideIV->getInductionDescriptor();
865 return B.createNaryOp(
866 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
867 ? Instruction::FSub
868 : Instruction::FAdd,
869 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
870 }
871 llvm_unreachable("all possible induction types must be handled");
872 return nullptr;
873 }
874
875 void VPlanTransforms::optimizeInductionExitUsers(
876 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
877 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
878 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
879 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
880 for (VPRecipeBase &R : ExitVPBB->phis()) {
881 auto *ExitIRI = cast<VPIRPhi>(&R);
882
883 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
884 VPValue *Escape = nullptr;
885 if (PredVPBB == MiddleVPBB)
886 Escape = optimizeLatchExitInductionUser(
887 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), EndValues);
888 else
889 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
890 ExitIRI->getOperand(Idx));
891 if (Escape)
892 ExitIRI->setOperand(Idx, Escape);
893 }
894 }
895 }
896 }
897
898 /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
899 /// them with already existing recipes expanding the same SCEV expression.
900 static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
901 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
902
903 for (VPRecipeBase &R :
904 make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
905 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
906 if (!ExpR)
907 continue;
908
909 auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
910 if (I.second)
911 continue;
912 ExpR->replaceAllUsesWith(I.first->second);
913 ExpR->eraseFromParent();
914 }
915 }
916
917 static void recursivelyDeleteDeadRecipes(VPValue *V) {
918 SmallVector<VPValue *> WorkList;
919 SmallPtrSet<VPValue *, 8> Seen;
920 WorkList.push_back(V);
921
922 while (!WorkList.empty()) {
923 VPValue *Cur = WorkList.pop_back_val();
924 if (!Seen.insert(Cur).second)
925 continue;
926 VPRecipeBase *R = Cur->getDefiningRecipe();
927 if (!R)
928 continue;
929 if (!isDeadRecipe(*R))
930 continue;
931 WorkList.append(R->op_begin(), R->op_end());
932 R->eraseFromParent();
933 }
934 }
935
936 /// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
937 /// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
938 /// foldable live-ins.
939 static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
940 ArrayRef<VPValue *> Operands,
941 const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
942 SmallVector<Value *, 4> Ops;
943 for (VPValue *Op : Operands) {
944 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
945 return nullptr;
946 Ops.push_back(Op->getLiveInIRValue());
947 }
948
949 InstSimplifyFolder Folder(DL);
950 if (Instruction::isBinaryOp(Opcode))
951 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
952 Ops[1]);
953 if (Instruction::isCast(Opcode))
954 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
955 TypeInfo.inferScalarType(R.getVPSingleValue()));
956 switch (Opcode) {
957 case VPInstruction::LogicalAnd:
958 return Folder.FoldSelect(Ops[0], Ops[1],
959 ConstantInt::getNullValue(Ops[1]->getType()));
960 case VPInstruction::Not:
961 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
962 Constant::getAllOnesValue(Ops[0]->getType()));
963 case Instruction::Select:
964 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
965 case Instruction::ICmp:
966 case Instruction::FCmp:
967 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
968 Ops[1]);
969 case Instruction::GetElementPtr: {
970 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
971 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
972 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
973 RFlags.getGEPNoWrapFlags());
974 }
975 case VPInstruction::PtrAdd:
976 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
977 Ops[1],
978 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
979 case Instruction::InsertElement:
980 return Folder.FoldInsertElement(Ops[0], Ops[1], Ops[2]);
981 case Instruction::ExtractElement:
982 return Folder.FoldExtractElement(Ops[0], Ops[1]);
983 }
984 return nullptr;
985 }
986
987 /// Try to simplify recipe \p R.
988 static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
989 using namespace llvm::VPlanPatternMatch;
990 VPlan *Plan = R.getParent()->getPlan();
991
992 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
993 if (!Def)
994 return;
995
996 // Simplification of live-in IR values for SingleDef recipes using
997 // InstSimplifyFolder.
998 if (TypeSwitch<VPRecipeBase *, bool>(&R)
999 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1000 VPReplicateRecipe>([&](auto *I) {
1001 const DataLayout &DL =
1002 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1003 Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
1004 TypeInfo);
1005 if (V)
1006 I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
1007 return V;
1008 })
1009 .Default([](auto *) { return false; }))
1010 return;
1011
1012 // Fold PredPHI LiveIn -> LiveIn.
1013 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
1014 VPValue *Op = PredPHI->getOperand(0);
1015 if (Op->isLiveIn())
1016 PredPHI->replaceAllUsesWith(Op);
1017 }
1018
1019 VPValue *A;
1020 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1021 Type *TruncTy = TypeInfo.inferScalarType(Def);
1022 Type *ATy = TypeInfo.inferScalarType(A);
1023 if (TruncTy == ATy) {
1024 Def->replaceAllUsesWith(A);
1025 } else {
1026 // Don't replace a scalarizing recipe with a widened cast.
1027 if (isa<VPReplicateRecipe>(Def))
1028 return;
1029 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1030
1031 unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1032 ? Instruction::SExt
1033 : Instruction::ZExt;
1034 auto *VPC =
1035 new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
1036 if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1037 // UnderlyingExt has distinct return type, used to retain legacy cost.
1038 VPC->setUnderlyingValue(UnderlyingExt);
1039 }
1040 VPC->insertBefore(&R);
1041 Def->replaceAllUsesWith(VPC);
1042 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1043 auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
1044 VPC->insertBefore(&R);
1045 Def->replaceAllUsesWith(VPC);
1046 }
1047 }
1048 #ifndef NDEBUG
1049 // Verify that the cached type info for both A and its users is still
1050 // accurate by comparing it to freshly computed types.
1051 VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType());
1052 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1053 for (VPUser *U : A->users()) {
1054 auto *R = cast<VPRecipeBase>(U);
1055 for (VPValue *VPV : R->definedValues())
1056 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1057 }
1058 #endif
1059 }
1060
1061 // Simplify (X && Y) || (X && !Y) -> X.
1062 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1063 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1064 // recipes to be visited during simplification.
1065 VPValue *X, *Y;
1066 if (match(Def,
1067 m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1068 m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
1069 Def->replaceAllUsesWith(X);
1070 Def->eraseFromParent();
1071 return;
1072 }
1073
1074 // OR x, 1 -> 1.
1075 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1076 Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)
1077 : Def->getOperand(0));
1078 Def->eraseFromParent();
1079 return;
1080 }
1081
1082 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1083 return Def->replaceAllUsesWith(X);
1084
1085 // select !c, x, y -> select c, y, x
1086 VPValue *C;
1087 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1088 Def->setOperand(0, C);
1089 Def->setOperand(1, Y);
1090 Def->setOperand(2, X);
1091 return;
1092 }
1093
1094 if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
1095 return Def->replaceAllUsesWith(A);
1096
1097 if (match(Def, m_Not(m_VPValue(A)))) {
1098 if (match(A, m_Not(m_VPValue(A))))
1099 return Def->replaceAllUsesWith(A);
1100
1101 // Try to fold Not into compares by adjusting the predicate in-place.
1102 if (isa<VPWidenRecipe>(A) && A->getNumUsers() == 1) {
1103 auto *WideCmp = cast<VPWidenRecipe>(A);
1104 if (WideCmp->getOpcode() == Instruction::ICmp ||
1105 WideCmp->getOpcode() == Instruction::FCmp) {
1106 WideCmp->setPredicate(
1107 CmpInst::getInversePredicate(WideCmp->getPredicate()));
1108 Def->replaceAllUsesWith(WideCmp);
1109 // If WideCmp doesn't have a debug location, use the one from the
1110 // negation, to preserve the location.
1111 if (!WideCmp->getDebugLoc() && R.getDebugLoc())
1112 WideCmp->setDebugLoc(R.getDebugLoc());
1113 }
1114 }
1115 }
1116
1117 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1118 if ((match(Def,
1119 m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
1120 match(Def,
1121 m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
1122 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1123 TypeInfo.inferScalarType(Def))
1124 return Def->replaceAllUsesWith(Def->getOperand(1));
1125
1126 if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(
1127 m_VPValue(X), m_SpecificInt(1)))) {
1128 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1129 if (TypeInfo.inferScalarType(X) != WideStepTy)
1130 X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy);
1131 Def->replaceAllUsesWith(X);
1132 return;
1133 }
1134
1135 // For i1 vp.merges produced by AnyOf reductions:
1136 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1137 if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
1138 m_VPValue(X), m_VPValue())) &&
1139 match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
1140 TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
1141 Def->setOperand(1, Def->getOperand(0));
1142 Def->setOperand(0, Y);
1143 return;
1144 }
1145
1146 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1147 if (Phi->getOperand(0) == Phi->getOperand(1))
1148 Def->replaceAllUsesWith(Phi->getOperand(0));
1149 return;
1150 }
1151
1152 // Look through ExtractLastElement (BuildVector ....).
1153 if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
1154 m_BuildVector()))) {
1155 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1156 Def->replaceAllUsesWith(
1157 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1158 return;
1159 }
1160
1161 // Look through ExtractPenultimateElement (BuildVector ....).
1162 if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1163 m_BuildVector()))) {
1164 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1165 Def->replaceAllUsesWith(
1166 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1167 return;
1168 }
1169
1170 // Some simplifications can only be applied after unrolling. Perform them
1171 // below.
1172 if (!Plan->isUnrolled())
1173 return;
1174
1175 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1176 // the first lane is demanded.
1177 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1178 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1179 Steps->replaceAllUsesWith(Steps->getOperand(0));
1180 return;
1181 }
1182 }
1183 // Simplify redundant ReductionStartVector recipes after unrolling.
1184 VPValue *StartV;
1185 if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1186 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1187 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1188 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1189 return PhiR && PhiR->isInLoop();
1190 });
1191 return;
1192 }
1193
1194 if (match(Def,
1195 m_VPInstruction<VPInstruction::ExtractLastElement>(
1196 m_VPInstruction<VPInstruction::Broadcast>(m_VPValue(A))))) {
1197 Def->replaceAllUsesWith(A);
1198 return;
1199 }
1200
1201 VPInstruction *OpVPI;
1202 if (match(Def, m_VPInstruction<VPInstruction::ExtractLastElement>(
1203 m_VPInstruction(OpVPI))) &&
1204 OpVPI->isVectorToScalar()) {
1205 Def->replaceAllUsesWith(OpVPI);
1206 return;
1207 }
1208 }
1209
1210 void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
1211 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1212 Plan.getEntry());
1213 VPTypeAnalysis TypeInfo(&CanonicalIVTy);
1214 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1215 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1216 simplifyRecipe(R, TypeInfo);
1217 }
1218 }
1219 }
1220
1221 static void narrowToSingleScalarRecipes(VPlan &Plan) {
1222 if (Plan.hasScalarVFOnly())
1223 return;
1224
1225 // Try to narrow wide and replicating recipes to single scalar recipes,
1226 // based on VPlan analysis. Only process blocks in the loop region for now,
1227 // without traversing into nested regions, as recipes in replicate regions
1228 // cannot be converted yet.
1229 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1230 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1231 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1232 if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
1233 continue;
1234 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1235 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1236 continue;
1237
1238 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1239 // Skip recipes that aren't single scalars or don't have only their
1240 // scalar results used. In the latter case, we would introduce extra
1241 // broadcasts.
1242 if (!vputils::isSingleScalar(RepOrWidenR) ||
1243 any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
1244 return !U->usesScalars(RepOrWidenR);
1245 }))
1246 continue;
1247
1248 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1249 RepOrWidenR->operands(),
1250 true /*IsSingleScalar*/);
1251 Clone->insertBefore(RepOrWidenR);
1252 RepOrWidenR->replaceAllUsesWith(Clone);
1253 }
1254 }
1255 }
1256
1257 /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1258 /// to make sure the masks are simplified.
1259 static void simplifyBlends(VPlan &Plan) {
1260 using namespace llvm::VPlanPatternMatch;
1261 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1262 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1263 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1264 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1265 if (!Blend)
1266 continue;
1267
1268 // Try to remove redundant blend recipes.
1269 SmallPtrSet<VPValue *, 4> UniqueValues;
1270 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1271 UniqueValues.insert(Blend->getIncomingValue(0));
1272 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1273 if (!match(Blend->getMask(I), m_False()))
1274 UniqueValues.insert(Blend->getIncomingValue(I));
1275
1276 if (UniqueValues.size() == 1) {
1277 Blend->replaceAllUsesWith(*UniqueValues.begin());
1278 Blend->eraseFromParent();
1279 continue;
1280 }
1281
1282 if (Blend->isNormalized())
1283 continue;
1284
1285 // Normalize the blend so its first incoming value is used as the initial
1286 // value with the others blended into it.
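// For example (illustrative operand names): an unnormalized
//   BLEND %a/%m0, %b/%m1, %c/%m2
// becomes
//   BLEND %a, %b/%m1, %c/%m2
// where %a is the initial value and %m0 may become dead.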
1287
1288 unsigned StartIndex = 0;
1289 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1290 // If a value's mask is used only by the blend then it can be deadcoded.
1291 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1292 // that's used by multiple blends where it can be removed from them all.
1293 VPValue *Mask = Blend->getMask(I);
1294 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1295 StartIndex = I;
1296 break;
1297 }
1298 }
1299
1300 SmallVector<VPValue *, 4> OperandsWithMask;
1301 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1302
1303 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1304 if (I == StartIndex)
1305 continue;
1306 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1307 OperandsWithMask.push_back(Blend->getMask(I));
1308 }
1309
1310 auto *NewBlend = new VPBlendRecipe(
1311 cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
1312 NewBlend->insertBefore(&R);
1313
1314 VPValue *DeadMask = Blend->getMask(StartIndex);
1315 Blend->replaceAllUsesWith(NewBlend);
1316 Blend->eraseFromParent();
1317 recursivelyDeleteDeadRecipes(DeadMask);
1318
1319 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1320 VPValue *NewMask;
1321 if (NewBlend->getNumOperands() == 3 &&
1322 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1323 VPValue *Inc0 = NewBlend->getOperand(0);
1324 VPValue *Inc1 = NewBlend->getOperand(1);
1325 VPValue *OldMask = NewBlend->getOperand(2);
1326 NewBlend->setOperand(0, Inc1);
1327 NewBlend->setOperand(1, Inc0);
1328 NewBlend->setOperand(2, NewMask);
1329 if (OldMask->getNumUsers() == 0)
1330 cast<VPInstruction>(OldMask)->eraseFromParent();
1331 }
1332 }
1333 }
1334 }
1335
1336 /// Optimize the width of vector induction variables in \p Plan based on a known
1337 /// constant trip count, \p BestVF and \p BestUF.
1338 static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1339 ElementCount BestVF,
1340 unsigned BestUF) {
1341 // Only proceed if we have not completely removed the vector region.
1342 if (!Plan.getVectorLoopRegion())
1343 return false;
1344
1345 if (!Plan.getTripCount()->isLiveIn())
1346 return false;
1347 auto *TC = dyn_cast_if_present<ConstantInt>(
1348 Plan.getTripCount()->getUnderlyingValue());
1349 if (!TC || !BestVF.isFixed())
1350 return false;
1351
1352 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1353 // and UF. Returns at least 8.
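// Worked illustration (assumed values): with TC = 1000, VF = 4 and UF = 2,
// Align = 8, AlignedTC = 1000 and MaxVal = 999 (10 active bits), so the
// resulting width is PowerOf2Ceil(10) = 16 bits.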
1354 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1355 APInt AlignedTC =
1356 Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
1357 APInt::Rounding::UP);
1358 APInt MaxVal = AlignedTC - 1;
1359 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1360 };
1361 unsigned NewBitWidth =
1362 ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1363
1364 LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
1365 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1366
1367 bool MadeChange = false;
1368
1369 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1370 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1371 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1372
1373 // Currently only handle canonical IVs as it is trivial to replace the start
1374 // and stop values, and we currently only perform the optimization when the
1375 // IV has a single use.
1376 if (!WideIV || !WideIV->isCanonical() ||
1377 WideIV->hasMoreThanOneUniqueUser() ||
1378 NewIVTy == WideIV->getScalarType())
1379 continue;
1380
1381 // Currently only handle cases where the single user is a header-mask
1382 // comparison with the backedge-taken-count.
1383 using namespace VPlanPatternMatch;
1384 if (!match(
1385 *WideIV->user_begin(),
1386 m_Binary<Instruction::ICmp>(
1387 m_Specific(WideIV),
1388 m_Broadcast(m_Specific(Plan.getOrCreateBackedgeTakenCount())))))
1389 continue;
1390
1391 // Update IV operands and comparison bound to use new narrower type.
1392 auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1393 WideIV->setStartValue(NewStart);
1394 auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1395 WideIV->setStepValue(NewStep);
1396
1397 auto *NewBTC = new VPWidenCastRecipe(
1398 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1399 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1400 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1401 Cmp->setOperand(1, NewBTC);
1402
1403 MadeChange = true;
1404 }
1405
1406 return MadeChange;
1407 }
1408
1409 /// Return true if \p Cond is known to be true for given \p BestVF and \p
1410 /// BestUF.
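/// For example (illustrative): a condition of the form
///   icmp eq (CanonicalIV + VF * UF), vector-trip-count
/// is treated as known true when SCEV proves the original trip count equals
/// VF * UF, i.e. the vector loop body executes exactly once.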
1411 static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1412 ElementCount BestVF, unsigned BestUF,
1413 ScalarEvolution &SE) {
1414 using namespace llvm::VPlanPatternMatch;
1415 if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue())))
1416 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1417 &SE](VPValue *C) {
1418 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1419 });
1420
1421 auto *CanIV = Plan.getCanonicalIV();
1422 if (!match(Cond, m_Binary<Instruction::ICmp>(
1423 m_Specific(CanIV->getBackedgeValue()),
1424 m_Specific(&Plan.getVectorTripCount()))) ||
1425 cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
1426 CmpInst::ICMP_EQ)
1427 return false;
1428
1429 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1430 // count is not conveniently available as SCEV so far, so we compare directly
1431 // against the original trip count. This is stricter than necessary, as we
1432 // will only return true if the trip count == vector trip count.
1433 // TODO: Use SCEV for vector trip count once available, to cover cases where
1434 // vector trip count == UF * VF, but original trip count != UF * VF.
1435 const SCEV *TripCount =
1436 vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1437 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1438 "Trip count SCEV must be computable");
1439 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1440 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1441 return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
1442 }
1443
1444 /// Try to simplify the branch condition of \p Plan. This may restrict the
1445 /// resulting plan to \p BestVF and \p BestUF.
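/// For example (illustrative): if the trip count is a known constant 8 and
/// BestVF = 8 with BestUF = 1, the vector loop body executes exactly once,
/// so the latch terminator can be dropped and, when only canonical-IV and
/// first-order-recurrence phis remain in the header, the region itself can be
/// removed.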
1446 static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1447 unsigned BestUF,
1448 PredicatedScalarEvolution &PSE) {
1449 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1450 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1451 auto *Term = &ExitingVPBB->back();
1452 VPValue *Cond;
1453 ScalarEvolution &SE = *PSE.getSE();
1454 using namespace llvm::VPlanPatternMatch;
1455 if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
1456 match(Term, m_BranchOnCond(
1457 m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
1458 // Try to simplify the branch condition if TC <= VF * UF when the latch
1459 // terminator is BranchOnCount or BranchOnCond where the input is
1460 // Not(ActiveLaneMask).
1461 const SCEV *TripCount =
1462 vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1463 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1464 "Trip count SCEV must be computable");
1465 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1466 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1467 if (TripCount->isZero() ||
1468 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1469 return false;
1470 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1471 // For BranchOnCond, check if we can prove the condition to be true using VF
1472 // and UF.
1473 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1474 return false;
1475 } else {
1476 return false;
1477 }
1478
1479 // The vector loop region only executes once. If possible, completely remove
1480 // the region, otherwise replace the terminator controlling the latch with
1481 // (BranchOnCond true).
1482 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1483 auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
1484 if (all_of(
1485 Header->phis(),
1486 IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
1487 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1488 auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR);
1489 HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue());
1490 HeaderPhiR->eraseFromParent();
1491 }
1492
1493 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1494 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1495 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1496 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1497
1498 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1499 B->setParent(nullptr);
1500
1501 VPBlockUtils::connectBlocks(Preheader, Header);
1502 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1503 VPlanTransforms::simplifyRecipes(Plan, *CanIVTy);
1504 } else {
1505 // The vector region contains header phis for which we cannot remove the
1506 // loop region yet.
1507 LLVMContext &Ctx = SE.getContext();
1508 auto *BOC = new VPInstruction(
1509 VPInstruction::BranchOnCond,
1510 {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
1511 ExitingVPBB->appendRecipe(BOC);
1512 }
1513
1514 Term->eraseFromParent();
1515
1516 return true;
1517 }
1518
1519 void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1520 unsigned BestUF,
1521 PredicatedScalarEvolution &PSE) {
1522 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1523 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1524
1525 bool MadeChange =
1526 simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1527 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1528
1529 if (MadeChange) {
1530 Plan.setVF(BestVF);
1531 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1532 }
1533 // TODO: Further simplifications are possible
1534 // 1. Replace inductions with constants.
1535 // 2. Replace vector loop region with VPBasicBlock.
1536 }
1537
1538 /// Sink users of \p FOR after the recipe defining the previous value \p
1539 /// Previous of the recurrence. \returns true if all users of \p FOR could be
1540 /// re-arranged as needed or false if it is not possible.
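/// For example (illustrative), given
///   %for  = phi [ %init ], [ %prev ]
///   %use  = add %for, 1    ; user of the recurrence
///   %prev = load %ptr      ; previous value, defined after its user
/// %use must be sunk after %prev so that the FirstOrderRecurrenceSplice
/// created after %prev (see adjustFixedOrderRecurrences) dominates all users.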
1541 static bool
1542 sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1543 VPRecipeBase *Previous,
1544 VPDominatorTree &VPDT) {
1545 // Collect recipes that need sinking.
1546 SmallVector<VPRecipeBase *> WorkList;
1547 SmallPtrSet<VPRecipeBase *, 8> Seen;
1548 Seen.insert(Previous);
1549 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1550     // The previous value must not depend on the users of the recurrence phi;
1551     // if it did, FOR would not be a fixed-order recurrence.
1552 if (SinkCandidate == Previous)
1553 return false;
1554
1555 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1556 !Seen.insert(SinkCandidate).second ||
1557 VPDT.properlyDominates(Previous, SinkCandidate))
1558 return true;
1559
1560 if (SinkCandidate->mayHaveSideEffects())
1561 return false;
1562
1563 WorkList.push_back(SinkCandidate);
1564 return true;
1565 };
1566
1567 // Recursively sink users of FOR after Previous.
1568 WorkList.push_back(FOR);
1569 for (unsigned I = 0; I != WorkList.size(); ++I) {
1570 VPRecipeBase *Current = WorkList[I];
1571 assert(Current->getNumDefinedValues() == 1 &&
1572 "only recipes with a single defined value expected");
1573
1574 for (VPUser *User : Current->getVPSingleValue()->users()) {
1575 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1576 return false;
1577 }
1578 }
1579
1580 // Keep recipes to sink ordered by dominance so earlier instructions are
1581 // processed first.
1582 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1583 return VPDT.properlyDominates(A, B);
1584 });
1585
1586 for (VPRecipeBase *SinkCandidate : WorkList) {
1587 if (SinkCandidate == FOR)
1588 continue;
1589
1590 SinkCandidate->moveAfter(Previous);
1591 Previous = SinkCandidate;
1592 }
1593 return true;
1594 }
1595
1596 /// Try to hoist \p Previous and its operands before all users of \p FOR.
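/// This is used as a fallback in adjustFixedOrderRecurrences when sinking the
/// users after \p Previous is not possible (e.g. because one of them may have
/// side effects); instead \p Previous and its to-be-hoisted operands are moved
/// before the earliest user of \p FOR, provided \p Previous neither reads
/// memory nor has side effects and no hoisted operand has side effects.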
1597 static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1598 VPRecipeBase *Previous,
1599 VPDominatorTree &VPDT) {
1600 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1601 return false;
1602
1603 // Collect recipes that need hoisting.
1604 SmallVector<VPRecipeBase *> HoistCandidates;
1605 SmallPtrSet<VPRecipeBase *, 8> Visited;
1606 VPRecipeBase *HoistPoint = nullptr;
1607 // Find the closest hoist point by looking at all users of FOR and selecting
1608 // the recipe dominating all other users.
1609 for (VPUser *U : FOR->users()) {
1610 auto *R = cast<VPRecipeBase>(U);
1611 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1612 HoistPoint = R;
1613 }
1614 assert(all_of(FOR->users(),
1615 [&VPDT, HoistPoint](VPUser *U) {
1616 auto *R = cast<VPRecipeBase>(U);
1617 return HoistPoint == R ||
1618 VPDT.properlyDominates(HoistPoint, R);
1619 }) &&
1620 "HoistPoint must dominate all users of FOR");
1621
1622 auto NeedsHoisting = [HoistPoint, &VPDT,
1623 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1624 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1625 if (!HoistCandidate)
1626 return nullptr;
1627 VPRegionBlock *EnclosingLoopRegion =
1628 HoistCandidate->getParent()->getEnclosingLoopRegion();
1629 assert((!HoistCandidate->getParent()->getParent() ||
1630 HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1631 "CFG in VPlan should still be flat, without replicate regions");
1632 // Hoist candidate was already visited, no need to hoist.
1633 if (!Visited.insert(HoistCandidate).second)
1634 return nullptr;
1635
1636     // The candidate is defined outside the loop region or is a header phi; it
1637     // already dominates FOR's users without hoisting.
1638 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1639 return nullptr;
1640
1641 // If we reached a recipe that dominates HoistPoint, we don't need to
1642 // hoist the recipe.
1643 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1644 return nullptr;
1645 return HoistCandidate;
1646 };
1647 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1648 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1649 // associated dependencies.
1650 return !HoistCandidate->mayHaveSideEffects();
1651 };
1652
1653 if (!NeedsHoisting(Previous->getVPSingleValue()))
1654 return true;
1655
1656 // Recursively try to hoist Previous and its operands before all users of FOR.
1657 HoistCandidates.push_back(Previous);
1658
1659 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1660 VPRecipeBase *Current = HoistCandidates[I];
1661 assert(Current->getNumDefinedValues() == 1 &&
1662 "only recipes with a single defined value expected");
1663 if (!CanHoist(Current))
1664 return false;
1665
1666 for (VPValue *Op : Current->operands()) {
1667 // If we reach FOR, it means the original Previous depends on some other
1668 // recurrence that in turn depends on FOR. If that is the case, we would
1669 // also need to hoist recipes involving the other FOR, which may break
1670 // dependencies.
1671 if (Op == FOR)
1672 return false;
1673
1674 if (auto *R = NeedsHoisting(Op))
1675 HoistCandidates.push_back(R);
1676 }
1677 }
1678
1679 // Order recipes to hoist by dominance so earlier instructions are processed
1680 // first.
1681 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1682 return VPDT.properlyDominates(A, B);
1683 });
1684
1685 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1686 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1687 HoistPoint->getIterator());
1688 }
1689
1690 return true;
1691 }
1692
1693 bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1694 VPBuilder &LoopBuilder) {
1695 VPDominatorTree VPDT;
1696 VPDT.recalculate(Plan);
1697
1698 SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1699 for (VPRecipeBase &R :
1700 Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
1701 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
1702 RecurrencePhis.push_back(FOR);
1703
1704 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1705 SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1706 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1707 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1708 // to terminate.
1709 while (auto *PrevPhi =
1710 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
1711 assert(PrevPhi->getParent() == FOR->getParent());
1712 assert(SeenPhis.insert(PrevPhi).second);
1713 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1714 }
1715
1716 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1717 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1718 return false;
1719
1720 // Introduce a recipe to combine the incoming and previous values of a
1721 // fixed-order recurrence.
1722 VPBasicBlock *InsertBlock = Previous->getParent();
1723 if (isa<VPHeaderPHIRecipe>(Previous))
1724 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
1725 else
1726 LoopBuilder.setInsertPoint(InsertBlock,
1727 std::next(Previous->getIterator()));
1728
1729 auto *RecurSplice =
1730 LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
1731 {FOR, FOR->getBackedgeValue()});
1732
1733 FOR->replaceAllUsesWith(RecurSplice);
1734 // Set the first operand of RecurSplice to FOR again, after replacing
1735 // all users.
1736 RecurSplice->setOperand(0, FOR);
1737 }
1738 return true;
1739 }
1740
1741 void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
1742 for (VPRecipeBase &R :
1743 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
1744 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
1745 if (!PhiR)
1746 continue;
1747 RecurKind RK = PhiR->getRecurrenceKind();
1748 if (RK != RecurKind::Add && RK != RecurKind::Mul)
1749 continue;
1750
1751 for (VPUser *U : collectUsersRecursively(PhiR))
1752 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
1753 RecWithFlags->dropPoisonGeneratingFlags();
1754 }
1755 }
1756 }
1757
1758 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
1759 static void licm(VPlan &Plan) {
1760 VPBasicBlock *Preheader = Plan.getVectorPreheader();
1761
1762 // Return true if we do not know how to (mechanically) hoist a given recipe
1763 // out of a loop region. Does not address legality concerns such as aliasing
1764 // or speculation safety.
1765 auto CannotHoistRecipe = [](VPRecipeBase &R) {
1766 // Allocas cannot be hoisted.
1767 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1768 return RepR && RepR->getOpcode() == Instruction::Alloca;
1769 };
1770
1771 // Hoist any loop invariant recipes from the vector loop region to the
1772   // preheader. Perform a shallow traversal of the vector loop region, to
1773 // exclude recipes in replicate regions.
1774 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1775 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1776 vp_depth_first_shallow(LoopRegion->getEntry()))) {
1777 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1778 if (CannotHoistRecipe(R))
1779 continue;
1780 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
1781 // their memory location is not modified in the vector loop.
1782 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
1783 any_of(R.operands(), [](VPValue *Op) {
1784 return !Op->isDefinedOutsideLoopRegions();
1785 }))
1786 continue;
1787 R.moveBefore(*Preheader, Preheader->end());
1788 }
1789 }
1790 }
1791
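/// For example (illustrative): if \p MinBWs records that a widened i32 add
/// only needs 16 bits, its operands are truncated to i16 and the narrowed
/// result is zero-extended back to i32; the redundant casts are expected to
/// be cleaned up by later recipe simplification.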
1792 void VPlanTransforms::truncateToMinimalBitwidths(
1793 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
1794 // Keep track of created truncates, so they can be re-used. Note that we
1795   // cannot use RAUW after creating a new truncate, as this could make
1796 // other uses have different types for their operands, making them invalidly
1797 // typed.
1798 DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
1799 Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
1800 VPTypeAnalysis TypeInfo(CanonicalIVType);
1801 VPBasicBlock *PH = Plan.getVectorPreheader();
1802 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1803 vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
1804 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1805 if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
1806 VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(
1807 &R))
1808 continue;
1809
1810 VPValue *ResultVPV = R.getVPSingleValue();
1811 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
1812 unsigned NewResSizeInBits = MinBWs.lookup(UI);
1813 if (!NewResSizeInBits)
1814 continue;
1815
1816 // If the value wasn't vectorized, we must maintain the original scalar
1817       // type. Skip those here. Also
1818 // skip casts which do not need to be handled explicitly here, as
1819 // redundant casts will be removed during recipe simplification.
1820 if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R))
1821 continue;
1822
1823 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
1824 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
1825 assert(OldResTy->isIntegerTy() && "only integer types supported");
1826 (void)OldResSizeInBits;
1827
1828 LLVMContext &Ctx = CanonicalIVType->getContext();
1829 auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
1830
1831 // Any wrapping introduced by shrinking this operation shouldn't be
1832 // considered undefined behavior. So, we can't unconditionally copy
1833 // arithmetic wrapping flags to VPW.
1834 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
1835 VPW->dropPoisonGeneratingFlags();
1836
1837 using namespace llvm::VPlanPatternMatch;
1838 if (OldResSizeInBits != NewResSizeInBits &&
1839 !match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) {
1840 // Extend result to original width.
1841 auto *Ext =
1842 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
1843 Ext->insertAfter(&R);
1844 ResultVPV->replaceAllUsesWith(Ext);
1845 Ext->setOperand(0, ResultVPV);
1846 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
1847 } else {
1848 assert(
1849 match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
1850 "Only ICmps should not need extending the result.");
1851 }
1852
1853 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
1854 if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(&R))
1855 continue;
1856
1857 // Shrink operands by introducing truncates as needed.
1858 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
1859 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
1860 auto *Op = R.getOperand(Idx);
1861 unsigned OpSizeInBits =
1862 TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
1863 if (OpSizeInBits == NewResSizeInBits)
1864 continue;
1865 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
1866 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
1867 VPWidenCastRecipe *NewOp =
1868 IterIsEmpty
1869 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
1870 : ProcessedIter->second;
1871 R.setOperand(Idx, NewOp);
1872 if (!IterIsEmpty)
1873 continue;
1874 ProcessedIter->second = NewOp;
1875 if (!Op->isLiveIn()) {
1876 NewOp->insertBefore(&R);
1877 } else {
1878 PH->appendRecipe(NewOp);
1879 }
1880 }
1881
1882 }
1883 }
1884 }
1885
1886 /// Remove BranchOnCond recipes with true or false conditions together with
1887 /// removing dead edges to their successors.
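/// For example (illustrative): a block ending in branch-on-cond true keeps
/// only the edge to its first successor; the edge to the second successor is
/// disconnected and incoming values for this block are removed from the
/// successor's phi recipes.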
1888 static void removeBranchOnConst(VPlan &Plan) {
1889 using namespace llvm::VPlanPatternMatch;
1890 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1891 vp_depth_first_shallow(Plan.getEntry()))) {
1892 VPValue *Cond;
1893 if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
1894 !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
1895 continue;
1896
1897 unsigned RemovedIdx;
1898 if (match(Cond, m_True()))
1899 RemovedIdx = 1;
1900 else if (match(Cond, m_False()))
1901 RemovedIdx = 0;
1902 else
1903 continue;
1904
1905 VPBasicBlock *RemovedSucc =
1906 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
1907 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
1908 "There must be a single edge between VPBB and its successor");
1909     // Values coming from VPBB into phi recipes of RemovedSucc are removed from
1910 // these recipes.
1911 for (VPRecipeBase &R : RemovedSucc->phis()) {
1912 auto *Phi = cast<VPPhiAccessors>(&R);
1913 assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) &&
1914 "VPIRPhis must have a single predecessor");
1915 Phi->removeIncomingValueFor(VPBB);
1916 }
1917 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
1918 // automatically on VPlan destruction if it becomes unreachable.
1919 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
1920 VPBB->back().eraseFromParent();
1921 }
1922 }
1923
1924 void VPlanTransforms::optimize(VPlan &Plan) {
1925 runPass(removeRedundantCanonicalIVs, Plan);
1926 runPass(removeRedundantInductionCasts, Plan);
1927
1928 runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
1929 runPass(simplifyBlends, Plan);
1930 runPass(removeDeadRecipes, Plan);
1931 runPass(narrowToSingleScalarRecipes, Plan);
1932 runPass(legalizeAndOptimizeInductions, Plan);
1933 runPass(removeRedundantExpandSCEVRecipes, Plan);
1934 runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
1935 runPass(removeBranchOnConst, Plan);
1936 runPass(removeDeadRecipes, Plan);
1937
1938 runPass(createAndOptimizeReplicateRegions, Plan);
1939 runPass(mergeBlocksIntoPredecessors, Plan);
1940 runPass(licm, Plan);
1941 }
1942
1943 // Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
1944 // the loop terminator with a branch-on-cond recipe with the negated
1945 // active-lane-mask as operand. Note that this turns the loop into an
1946 // uncountable one. Only the existing terminator is replaced; all other existing
1947 // recipes/users remain unchanged, except for poison-generating flags being
1948 // dropped from the canonical IV increment. Return the created
1949 // VPActiveLaneMaskPHIRecipe.
1950 //
1951 // The function uses the following definitions:
1952 //
1953 // %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
1954 // calculate-trip-count-minus-VF (original TC) : original TC
1955 // %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
1956 // CanonicalIVPhi : CanonicalIVIncrement
1957 // %StartV is the canonical induction start value.
1958 //
1959 // The function adds the following recipes:
1960 //
1961 // vector.ph:
1962 // %TripCount = calculate-trip-count-minus-VF (original TC)
1963 // [if DataWithControlFlowWithoutRuntimeCheck]
1964 // %EntryInc = canonical-iv-increment-for-part %StartV
1965 // %EntryALM = active-lane-mask %EntryInc, %TripCount
1966 //
1967 // vector.body:
1968 // ...
1969 // %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
1970 // ...
1971 // %InLoopInc = canonical-iv-increment-for-part %IncrementValue
1972 // %ALM = active-lane-mask %InLoopInc, TripCount
1973 // %Negated = Not %ALM
1974 // branch-on-cond %Negated
1975 //
1976 static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
1977 VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
1978 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
1979 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
1980 auto *CanonicalIVPHI = Plan.getCanonicalIV();
1981 VPValue *StartV = CanonicalIVPHI->getStartValue();
1982
1983 auto *CanonicalIVIncrement =
1984 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
1985 // TODO: Check if dropping the flags is needed if
1986 // !DataAndControlFlowWithoutRuntimeCheck.
1987 CanonicalIVIncrement->dropPoisonGeneratingFlags();
1988 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
1989 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
1990 // we have to take unrolling into account. Each part needs to start at
1991 // Part * VF
1992 auto *VecPreheader = Plan.getVectorPreheader();
1993 VPBuilder Builder(VecPreheader);
1994
1995 // Create the ActiveLaneMask instruction using the correct start values.
1996 VPValue *TC = Plan.getTripCount();
1997
1998 VPValue *TripCount, *IncrementValue;
1999 if (!DataAndControlFlowWithoutRuntimeCheck) {
2000 // When the loop is guarded by a runtime overflow check for the loop
2001 // induction variable increment by VF, we can increment the value before
2002     // the get.active.lane.mask intrinsic and use the unmodified trip count.
2003 IncrementValue = CanonicalIVIncrement;
2004 TripCount = TC;
2005 } else {
2006 // When avoiding a runtime check, the active.lane.mask inside the loop
2007 // uses a modified trip count and the induction variable increment is
2008 // done after the active.lane.mask intrinsic is called.
2009 IncrementValue = CanonicalIVPHI;
2010 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2011 {TC}, DL);
2012 }
2013 auto *EntryIncrement = Builder.createOverflowingOp(
2014 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2015 "index.part.next");
2016
2017 // Create the active lane mask instruction in the VPlan preheader.
2018 auto *EntryALM =
2019 Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
2020 DL, "active.lane.mask.entry");
2021
2022 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2023 // preheader ActiveLaneMask instruction.
2024 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
2025 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2026
2027 // Create the active lane mask for the next iteration of the loop before the
2028 // original terminator.
2029 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2030 Builder.setInsertPoint(OriginalTerminator);
2031 auto *InLoopIncrement =
2032 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2033 {IncrementValue}, {false, false}, DL);
2034 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2035 {InLoopIncrement, TripCount}, DL,
2036 "active.lane.mask.next");
2037 LaneMaskPhi->addOperand(ALM);
2038
2039 // Replace the original terminator with BranchOnCond. We have to invert the
2040 // mask here because a true condition means jumping to the exit block.
2041 auto *NotMask = Builder.createNot(ALM, DL);
2042 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2043 OriginalTerminator->eraseFromParent();
2044 return LaneMaskPhi;
2045 }
2046
2047 /// Collect all VPValues representing a header mask through the (ICMP_ULE,
2048 /// WideCanonicalIV, backedge-taken-count) pattern.
2049 /// TODO: Introduce explicit recipe for header-mask instead of searching
2050 /// for the header-mask pattern manually.
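/// For example (illustrative), with tail folding the header mask typically
/// looks like
///   %mask = icmp ule %wide.canonical.iv, %backedge.taken.count
/// and disables the lanes past the trip count in the last vector iteration.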
2051 static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
2052 SmallVector<VPValue *> WideCanonicalIVs;
2053 auto *FoundWidenCanonicalIVUser =
2054 find_if(Plan.getCanonicalIV()->users(),
2055 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
2056 assert(count_if(Plan.getCanonicalIV()->users(),
2057 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
2058 1 &&
2059 "Must have at most one VPWideCanonicalIVRecipe");
2060 if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2061 auto *WideCanonicalIV =
2062 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2063 WideCanonicalIVs.push_back(WideCanonicalIV);
2064 }
2065
2066 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2067 // version of the canonical induction.
2068 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2069 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2070 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2071 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2072 WideCanonicalIVs.push_back(WidenOriginalIV);
2073 }
2074
2075   // Walk users of wide canonical IVs and collect all compares of the form
2076 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2077 SmallVector<VPValue *> HeaderMasks;
2078 for (auto *Wide : WideCanonicalIVs) {
2079 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2080 auto *HeaderMask = dyn_cast<VPInstruction>(U);
2081 if (!HeaderMask || !vputils::isHeaderMask(HeaderMask, Plan))
2082 continue;
2083
2084 assert(HeaderMask->getOperand(0) == Wide &&
2085 "WidenCanonicalIV must be the first operand of the compare");
2086 HeaderMasks.push_back(HeaderMask);
2087 }
2088 }
2089 return HeaderMasks;
2090 }
2091
2092 void VPlanTransforms::addActiveLaneMask(
2093 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2094 bool DataAndControlFlowWithoutRuntimeCheck) {
2095 assert((!DataAndControlFlowWithoutRuntimeCheck ||
2096 UseActiveLaneMaskForControlFlow) &&
2097 "DataAndControlFlowWithoutRuntimeCheck implies "
2098 "UseActiveLaneMaskForControlFlow");
2099
2100 auto *FoundWidenCanonicalIVUser =
2101 find_if(Plan.getCanonicalIV()->users(),
2102 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
2103 assert(FoundWidenCanonicalIVUser &&
2104 "Must have widened canonical IV when tail folding!");
2105 auto *WideCanonicalIV =
2106 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2107 VPSingleDefRecipe *LaneMask;
2108 if (UseActiveLaneMaskForControlFlow) {
2109 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2110 Plan, DataAndControlFlowWithoutRuntimeCheck);
2111 } else {
2112 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2113 LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
2114 {WideCanonicalIV, Plan.getTripCount()}, nullptr,
2115 "active.lane.mask");
2116 }
2117
2118 // Walk users of WideCanonicalIV and replace all compares of the form
2119 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
2120 // active-lane-mask.
2121 for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
2122 HeaderMask->replaceAllUsesWith(LaneMask);
2123 }
2124
2125 /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2126 /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2127 /// recipe could be created.
2128 /// \p HeaderMask Header Mask.
2129 /// \p CurRecipe Recipe to be transformed.
2130 /// \p TypeInfo VPlan-based type analysis.
2131 /// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2132 /// \p EVL The explicit vector length parameter of vector-predication
2133 /// intrinsics.
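/// For example (illustrative): a widened load masked only by the header mask
/// becomes a VPWidenLoadEVLRecipe with no mask, while a load masked by
/// (logical-and header-mask, %m) keeps %m as the mask of the new EVL recipe.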
2134 static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2135 VPRecipeBase &CurRecipe,
2136 VPTypeAnalysis &TypeInfo,
2137 VPValue &AllOneMask, VPValue &EVL) {
2138 using namespace llvm::VPlanPatternMatch;
2139 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2140 assert(OrigMask && "Unmasked recipe when folding tail");
2141 // HeaderMask will be handled using EVL.
2142 VPValue *Mask;
2143 if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2144 return Mask;
2145 return HeaderMask == OrigMask ? nullptr : OrigMask;
2146 };
2147
2148 return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
2149 .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
2150 VPValue *NewMask = GetNewMask(L->getMask());
2151 return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
2152 })
2153 .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2154 VPValue *NewMask = GetNewMask(S->getMask());
2155 return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
2156 })
2157 .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2158 VPValue *NewMask = GetNewMask(Red->getCondOp());
2159 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2160 })
2161 .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2162 VPValue *LHS, *RHS;
2163 // Transform select with a header mask condition
2164 // select(header_mask, LHS, RHS)
2165 // into vector predication merge.
2166 // vp.merge(all-true, LHS, RHS, EVL)
2167 if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2168 m_VPValue(RHS))))
2169 return nullptr;
2170 // Use all true as the condition because this transformation is
2171 // limited to selects whose condition is a header mask.
2172 return new VPWidenIntrinsicRecipe(
2173 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2174 TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2175 })
2176 .Default([&](VPRecipeBase *R) { return nullptr; });
2177 }
2178
2179 /// Replace recipes with their EVL variants.
2180 static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2181 Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
2182 VPTypeAnalysis TypeInfo(CanonicalIVType);
2183 LLVMContext &Ctx = CanonicalIVType->getContext();
2184 VPValue *AllOneMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
2185 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2186 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2187
2188 assert(all_of(Plan.getVF().users(),
2189 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
2190 VPWidenIntOrFpInductionRecipe>) &&
2191 "User of VF that we can't transform to EVL.");
2192 Plan.getVF().replaceAllUsesWith(&EVL);
2193
2194 // Defer erasing recipes till the end so that we don't invalidate the
2195 // VPTypeAnalysis cache.
2196 SmallVector<VPRecipeBase *> ToErase;
2197
2198 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
2199 // contained.
2200 bool ContainsFORs =
2201 any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2202 if (ContainsFORs) {
2203 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2204 VPValue *MaxEVL = &Plan.getVF();
2205     // Emit a VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2206 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2207 MaxEVL = Builder.createScalarZExtOrTrunc(MaxEVL, Type::getInt32Ty(Ctx),
2208 TypeInfo.inferScalarType(MaxEVL),
2209 DebugLoc());
2210
2211 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2212 VPValue *PrevEVL =
2213 Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
2214
2215 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2216 vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
2217 for (VPRecipeBase &R : *VPBB) {
2218 using namespace VPlanPatternMatch;
2219 VPValue *V1, *V2;
2220 if (!match(&R,
2221 m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2222 m_VPValue(V1), m_VPValue(V2))))
2223 continue;
2224 VPValue *Imm = Plan.getOrAddLiveIn(
2225 ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1));
2226 VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
2227 Intrinsic::experimental_vp_splice,
2228 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2229 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2230 VPSplice->insertBefore(&R);
2231 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2232 ToErase.push_back(&R);
2233 }
2234 }
2235 }
2236
2237 // Try to optimize header mask recipes away to their EVL variants.
2238 for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
2239 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
2240 auto *CurRecipe = cast<VPRecipeBase>(U);
2241 VPRecipeBase *EVLRecipe =
2242 optimizeMaskToEVL(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2243 if (!EVLRecipe)
2244 continue;
2245
2246 [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2247 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2248 "New recipe must define the same number of values as the "
2249 "original.");
2250 assert(
2251 NumDefVal <= 1 &&
2252 "Only supports recipes with a single definition or without users.");
2253 EVLRecipe->insertBefore(CurRecipe);
2254 if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
2255 VPValue *CurVPV = CurRecipe->getVPSingleValue();
2256 CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
2257 }
2258 ToErase.push_back(CurRecipe);
2259 }
2260 }
2261
2262 for (VPRecipeBase *R : reverse(ToErase)) {
2263 SmallVector<VPValue *> PossiblyDead(R->operands());
2264 R->eraseFromParent();
2265 for (VPValue *Op : PossiblyDead)
2266 recursivelyDeleteDeadRecipes(Op);
2267 }
2268 }
2269
2270 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2271 /// replaces all uses except the canonical IV increment of
2272 /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
2273 /// is used only for loop iterations counting after this transformation.
2274 ///
2275 /// The function uses the following definitions:
2276 /// %StartV is the canonical induction start value.
2277 ///
2278 /// The function adds the following recipes:
2279 ///
2280 /// vector.ph:
2281 /// ...
2282 ///
2283 /// vector.body:
2284 /// ...
2285 /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2286 /// [ %NextEVLIV, %vector.body ]
2287 /// %AVL = sub original TC, %EVLPhi
2288 /// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2289 /// ...
2290 /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2291 /// ...
2292 ///
2293 /// If MaxSafeElements is provided, the function adds the following recipes:
2294 /// vector.ph:
2295 /// ...
2296 ///
2297 /// vector.body:
2298 /// ...
2299 /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2300 /// [ %NextEVLIV, %vector.body ]
2301 /// %AVL = sub original TC, %EVLPhi
2302 /// %cmp = cmp ult %AVL, MaxSafeElements
2303 /// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2304 /// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2305 /// ...
2306 /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2307 /// ...
2308 ///
2309 bool VPlanTransforms::tryAddExplicitVectorLength(
2310 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2311 VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2312 // The transform updates all users of inductions to work based on EVL, instead
2313 // of the VF directly. At the moment, widened pointer inductions cannot be
2314 // updated, so bail out if the plan contains any.
2315 bool ContainsWidenPointerInductions =
2316 any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>);
2317 if (ContainsWidenPointerInductions)
2318 return false;
2319
2320 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2321 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2322 VPValue *StartV = CanonicalIVPHI->getStartValue();
2323
2324 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2325 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
2326 EVLPhi->insertAfter(CanonicalIVPHI);
2327 VPBuilder Builder(Header, Header->getFirstNonPhi());
2328 // Compute original TC - IV as the AVL (application vector length).
2329 VPValue *AVL = Builder.createNaryOp(
2330 Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
2331 if (MaxSafeElements) {
2332 // Support for MaxSafeDist for correct loop emission.
2333 VPValue *AVLSafe =
2334 Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
2335 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2336 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
2337 }
2338 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2339 DebugLoc());
2340
2341 auto *CanonicalIVIncrement =
2342 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2343 Builder.setInsertPoint(CanonicalIVIncrement);
2344 VPValue *OpVPEVL = VPEVL;
2345
2346 auto *I32Ty = Type::getInt32Ty(CanIVTy->getContext());
2347 OpVPEVL = Builder.createScalarZExtOrTrunc(
2348 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2349
2350 auto *NextEVLIV = Builder.createOverflowingOp(
2351 Instruction::Add, {OpVPEVL, EVLPhi},
2352 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2353 CanonicalIVIncrement->hasNoSignedWrap()},
2354 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2355 EVLPhi->addOperand(NextEVLIV);
2356
2357 transformRecipestoEVLRecipes(Plan, *VPEVL);
2358
2359 // Replace all uses of VPCanonicalIVPHIRecipe by
2360 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2361 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2362 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2363 // TODO: support unroll factor > 1.
2364 Plan.setUF(1);
2365 return true;
2366 }
2367
2368 void VPlanTransforms::dropPoisonGeneratingRecipes(
2369 VPlan &Plan,
2370 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2371 // Collect recipes in the backward slice of `Root` that may generate a poison
2372 // value that is used after vectorization.
2373 SmallPtrSet<VPRecipeBase *, 16> Visited;
2374 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2375 SmallVector<VPRecipeBase *, 16> Worklist;
2376 Worklist.push_back(Root);
2377
2378 // Traverse the backward slice of Root through its use-def chain.
2379 while (!Worklist.empty()) {
2380 VPRecipeBase *CurRec = Worklist.pop_back_val();
2381
2382 if (!Visited.insert(CurRec).second)
2383 continue;
2384
2385 // Prune search if we find another recipe generating a widen memory
2386 // instruction. Widen memory instructions involved in address computation
2387 // will lead to gather/scatter instructions, which don't need to be
2388 // handled.
2389 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
2390 VPHeaderPHIRecipe>(CurRec))
2391 continue;
2392
2393 // This recipe contributes to the address computation of a widen
2394 // load/store. If the underlying instruction has poison-generating flags,
2395 // drop them directly.
2396 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
2397 VPValue *A, *B;
2398 using namespace llvm::VPlanPatternMatch;
2399 // Dropping disjoint from an OR may yield incorrect results, as some
2400 // analysis may have converted it to an Add implicitly (e.g. SCEV used
2401 // for dependence analysis). Instead, replace it with an equivalent Add.
2402 // This is possible as all users of the disjoint OR only access lanes
2403 // where the operands are disjoint or poison otherwise.
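        // For example (illustrative): "or disjoint %a, %b" feeding an address
        // computation is replaced by "add %a, %b" without nuw/nsw, which is
        // equivalent whenever the operands have no common bits set.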
2404 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
2405 RecWithFlags->isDisjoint()) {
2406 VPBuilder Builder(RecWithFlags);
2407 VPInstruction *New = Builder.createOverflowingOp(
2408 Instruction::Add, {A, B}, {false, false},
2409 RecWithFlags->getDebugLoc());
2410 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2411 RecWithFlags->replaceAllUsesWith(New);
2412 RecWithFlags->eraseFromParent();
2413 CurRec = New;
2414 } else
2415 RecWithFlags->dropPoisonGeneratingFlags();
2416 } else {
2417 Instruction *Instr = dyn_cast_or_null<Instruction>(
2418 CurRec->getVPSingleValue()->getUnderlyingValue());
2419 (void)Instr;
2420 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2421 "found instruction with poison generating flags not covered by "
2422 "VPRecipeWithIRFlags");
2423 }
2424
2425 // Add new definitions to the worklist.
2426 for (VPValue *Operand : CurRec->operands())
2427 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2428 Worklist.push_back(OpDef);
2429 }
2430 });
2431
2432 // Traverse all the recipes in the VPlan and collect the poison-generating
2433   // recipes in the backward slice starting at the address of a
2434   // VPWidenMemoryRecipe or VPInterleaveRecipe.
2435 auto Iter = vp_depth_first_deep(Plan.getEntry());
2436 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
2437 for (VPRecipeBase &Recipe : *VPBB) {
2438 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
2439 Instruction &UnderlyingInstr = WidenRec->getIngredient();
2440 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2441 if (AddrDef && WidenRec->isConsecutive() &&
2442 BlockNeedsPredication(UnderlyingInstr.getParent()))
2443 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2444 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
2445 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2446 if (AddrDef) {
2447 // Check if any member of the interleave group needs predication.
2448 const InterleaveGroup<Instruction> *InterGroup =
2449 InterleaveRec->getInterleaveGroup();
2450 bool NeedPredication = false;
2451 for (int I = 0, NumMembers = InterGroup->getNumMembers();
2452 I < NumMembers; ++I) {
2453 Instruction *Member = InterGroup->getMember(I);
2454 if (Member)
2455 NeedPredication |= BlockNeedsPredication(Member->getParent());
2456 }
2457
2458 if (NeedPredication)
2459 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2460 }
2461 }
2462 }
2463 }
2464 }
2465
2466 void VPlanTransforms::createInterleaveGroups(
2467 VPlan &Plan,
2468 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
2469 &InterleaveGroups,
2470 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
2471 if (InterleaveGroups.empty())
2472 return;
2473
2474 // Interleave memory: for each Interleave Group we marked earlier as relevant
2475 // for this VPlan, replace the Recipes widening its memory instructions with a
2476 // single VPInterleaveRecipe at its insertion point.
2477 VPDominatorTree VPDT;
2478 VPDT.recalculate(Plan);
2479 for (const auto *IG : InterleaveGroups) {
2480 SmallVector<VPValue *, 4> StoredValues;
2481 for (unsigned i = 0; i < IG->getFactor(); ++i)
2482 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
2483 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
2484 StoredValues.push_back(StoreR->getStoredValue());
2485 }
2486
2487 bool NeedsMaskForGaps =
2488 IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
2489
2490 Instruction *IRInsertPos = IG->getInsertPos();
2491 auto *InsertPos =
2492 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
2493
2494 bool InBounds = false;
2495 if (auto *Gep = dyn_cast<GetElementPtrInst>(
2496 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
2497 InBounds = Gep->isInBounds();
2498
2499 // Get or create the start address for the interleave group.
2500 auto *Start =
2501 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
2502 VPValue *Addr = Start->getAddr();
2503 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
2504 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
2505 // We cannot re-use the address of member zero because it does not
2506 // dominate the insert position. Instead, use the address of the insert
2507 // position and create a PtrAdd adjusting it to the address of member
2508 // zero.
2509 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
2510 // InsertPos or sink loads above zero members to join it.
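      // For example (illustrative): if the insert position is member 2 of a
      // group of i32 accesses, its byte offset within the group is 2 * 4 = 8,
      // so a PtrAdd of -8 rebases its address to that of member zero.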
2511 assert(IG->getIndex(IRInsertPos) != 0 &&
2512 "index of insert position shouldn't be zero");
2513 auto &DL = IRInsertPos->getDataLayout();
2514 APInt Offset(32,
2515 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
2516 IG->getIndex(IRInsertPos),
2517 /*IsSigned=*/true);
2518 VPValue *OffsetVPV = Plan.getOrAddLiveIn(
2519 ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
2520 VPBuilder B(InsertPos);
2521 Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
2522 : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
2523 }
2524 // If the group is reverse, adjust the index to refer to the last vector
2525 // lane instead of the first. We adjust the index from the first vector
2526 // lane, rather than directly getting the pointer for lane VF - 1, because
2527 // the pointer operand of the interleaved access is supposed to be uniform.
2528 if (IG->isReverse()) {
2529 auto *ReversePtr = new VPVectorEndPointerRecipe(
2530 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
2531 -(int64_t)IG->getFactor(),
2532 InBounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(),
2533 InsertPos->getDebugLoc());
2534 ReversePtr->insertBefore(InsertPos);
2535 Addr = ReversePtr;
2536 }
2537 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
2538 InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
2539 VPIG->insertBefore(InsertPos);
2540
2541 unsigned J = 0;
2542 for (unsigned i = 0; i < IG->getFactor(); ++i)
2543 if (Instruction *Member = IG->getMember(i)) {
2544 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
2545 if (!Member->getType()->isVoidTy()) {
2546 VPValue *OriginalV = MemberR->getVPSingleValue();
2547 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
2548 J++;
2549 }
2550 MemberR->eraseFromParent();
2551 }
2552 }
2553 }
2554
2555 /// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
2556 /// value, phi and backedge value. In the following example:
2557 ///
2558 /// vector.ph:
2559 /// Successor(s): vector loop
2560 ///
2561 /// <x1> vector loop: {
2562 /// vector.body:
2563 /// WIDEN-INDUCTION %i = phi %start, %step, %vf
2564 /// ...
2565 /// EMIT branch-on-count ...
2566 /// No successors
2567 /// }
2568 ///
2569 /// WIDEN-INDUCTION will get expanded to:
2570 ///
2571 /// vector.ph:
2572 /// ...
2573 /// vp<%induction.start> = ...
2574 /// vp<%induction.increment> = ...
2575 ///
2576 /// Successor(s): vector loop
2577 ///
2578 /// <x1> vector loop: {
2579 /// vector.body:
2580 /// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
2581 /// ...
2582 /// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
2583 /// EMIT branch-on-count ...
2584 /// No successors
2585 /// }
2586 static void
2587 expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
2588 VPTypeAnalysis &TypeInfo) {
2589 VPlan *Plan = WidenIVR->getParent()->getPlan();
2590 VPValue *Start = WidenIVR->getStartValue();
2591 VPValue *Step = WidenIVR->getStepValue();
2592 VPValue *VF = WidenIVR->getVFValue();
2593 DebugLoc DL = WidenIVR->getDebugLoc();
2594
2595 // The value from the original loop to which we are mapping the new induction
2596 // variable.
2597 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
2598
2599 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
2600 Instruction::BinaryOps AddOp;
2601 Instruction::BinaryOps MulOp;
2602 // FIXME: The newly created binary instructions should contain nsw/nuw
2603 // flags, which can be found from the original scalar operations.
2604 VPIRFlags Flags;
2605 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
2606 AddOp = Instruction::Add;
2607 MulOp = Instruction::Mul;
2608 } else {
2609 AddOp = ID.getInductionOpcode();
2610 MulOp = Instruction::FMul;
2611 Flags = ID.getInductionBinOp()->getFastMathFlags();
2612 }
2613
2614 // If the phi is truncated, truncate the start and step values.
2615 VPBuilder Builder(Plan->getVectorPreheader());
2616 Type *StepTy = TypeInfo.inferScalarType(Step);
2617 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
2618 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
2619 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
2620 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
2621 StepTy = Ty;
2622 }
2623
2624 // Construct the initial value of the vector IV in the vector loop preheader.
2625 Type *IVIntTy =
2626 IntegerType::get(StepTy->getContext(), StepTy->getScalarSizeInBits());
2627 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
2628 if (StepTy->isFloatingPointTy())
2629 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
2630
2631 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
2632 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
2633
2634 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
2635 Init =
2636 Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");
2637
2638 // Create the widened phi of the vector IV.
2639 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
2640 WidenIVR->getDebugLoc(), "vec.ind");
2641 WidePHI->addOperand(Init);
2642 WidePHI->insertBefore(WidenIVR);
2643
2644 // Create the backedge value for the vector IV.
2645 VPValue *Inc;
2646 VPValue *Prev;
2647 // If unrolled, use the increment and prev value from the operands.
2648 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
2649 Inc = SplatVF;
2650 Prev = WidenIVR->getLastUnrolledPartOperand();
2651 } else {
2652 if (VPRecipeBase *R = VF->getDefiningRecipe())
2653 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
2654 // Multiply the vectorization factor by the step using integer or
2655 // floating-point arithmetic as appropriate.
2656 if (StepTy->isFloatingPointTy())
2657 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
2658 DL);
2659 else
2660 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
2661 TypeInfo.inferScalarType(VF), DL);
2662
2663 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
2664 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
2665 Prev = WidePHI;
2666 }
2667
2668 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
2669 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
2670 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
2671 WidenIVR->getDebugLoc(), "vec.ind.next");
2672
2673 WidePHI->addOperand(Next);
2674
2675 WidenIVR->replaceAllUsesWith(WidePHI);
2676 }
2677
2678 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
2679   // Replace loop regions with explicit CFG.
2680 SmallVector<VPRegionBlock *> LoopRegions;
2681 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
2682 vp_depth_first_deep(Plan.getEntry()))) {
2683 if (!R->isReplicator())
2684 LoopRegions.push_back(R);
2685 }
2686 for (VPRegionBlock *R : LoopRegions)
2687 R->dissolveToCFGLoop();
2688 }
2689
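/// For example (illustrative): a VPEVLBasedIVPHIRecipe is lowered to a plain
/// scalar phi over its start and backedge values, and a WideIVStep is
/// expanded into an explicit multiply of the (possibly cast) vector step by
/// the scalar step.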
2690 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
2691 Type &CanonicalIVTy) {
2692 using namespace llvm::VPlanPatternMatch;
2693
2694 VPTypeAnalysis TypeInfo(&CanonicalIVTy);
2695 SmallVector<VPRecipeBase *> ToRemove;
2696 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2697 vp_depth_first_deep(Plan.getEntry()))) {
2698 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2699 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2700 auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
2701 {PhiR->getStartValue(), PhiR->getBackedgeValue()},
2702 PhiR->getDebugLoc(), "evl.based.iv");
2703 PhiR->replaceAllUsesWith(ScalarR);
2704 ToRemove.push_back(PhiR);
2705 continue;
2706 }
2707
2708 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
2709 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
2710 ToRemove.push_back(WidenIVR);
2711 continue;
2712 }
2713
2714 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
2715 Expr->decompose();
2716 ToRemove.push_back(Expr);
2717 }
2718
2719 VPValue *VectorStep;
2720 VPValue *ScalarStep;
2721 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
2722 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
2723 continue;
2724
2725 // Expand WideIVStep.
      auto *VPI = cast<VPInstruction>(&R);
      VPBuilder Builder(VPI);
      Type *IVTy = TypeInfo.inferScalarType(VPI);
      if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
        Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
                                          ? Instruction::UIToFP
                                          : Instruction::Trunc;
        VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
      }

      [[maybe_unused]] auto *ConstStep =
          ScalarStep->isLiveIn()
              ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
              : nullptr;
      assert(!ConstStep || ConstStep->getValue() != 1);
      (void)ConstStep;
      if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
        ScalarStep =
            Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
      }

      VPIRFlags Flags;
      if (IVTy->isFloatingPointTy())
        Flags = {VPI->getFastMathFlags()};

      unsigned MulOpc =
          IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
      VPInstruction *Mul = Builder.createNaryOp(
          MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
      VectorStep = Mul;
      VPI->replaceAllUsesWith(VectorStep);
      ToRemove.push_back(VPI);
    }
  }

  for (VPRecipeBase *R : ToRemove)
    R->eraseFromParent();
}

void VPlanTransforms::handleUncountableEarlyExit(
    VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
    VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
  using namespace llvm::VPlanPatternMatch;

  VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
  if (!EarlyExitVPBB->getSinglePredecessor() &&
      EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
    assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
           EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
           "unsupported early exit VPBB");
    // Early exit operand should always be last phi operand. If EarlyExitVPBB
    // has two predecessors and EarlyExitingVPBB is the first, swap the
    // operands of the phis.
    for (VPRecipeBase &R : EarlyExitVPBB->phis())
      cast<VPIRPhi>(&R)->swapOperands();
  }

  VPBuilder Builder(LatchVPBB->getTerminator());
  VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
  assert(
      match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
      "Terminator must be BranchOnCond");
  VPValue *CondOfEarlyExitingVPBB =
      EarlyExitingVPBB->getTerminator()->getOperand(0);
  auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
                              ? CondOfEarlyExitingVPBB
                              : Builder.createNot(CondOfEarlyExitingVPBB);

  // Split the middle block and have it conditionally branch to the early exit
  // block if CondToEarlyExit.
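  // AnyOf reduces the per-lane exit condition to a single boolean that is
  // true if any lane would take the early exit.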
  VPValue *IsEarlyExitTaken =
      Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
  VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
  VPBasicBlock *VectorEarlyExitVPBB =
      Plan.createVPBasicBlock("vector.early.exit");
  VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
  VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
  NewMiddle->swapSuccessors();

  VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
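  // After this, the CFG is: LatchVPBB -> middle.split, whose first successor
  // is vector.early.exit (taken when the early exit condition holds, via the
  // BranchOnCond added below) and whose second successor is the original
  // middle block; vector.early.exit then continues to the early exit block.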

  // Update the exit phis in the early exit block.
  VPBuilder MiddleBuilder(NewMiddle);
  VPBuilder EarlyExitB(VectorEarlyExitVPBB);
  for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
    auto *ExitIRI = cast<VPIRPhi>(&R);
    // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
    // a single predecessor and 1 if it has two.
    unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
    if (ExitIRI->getNumOperands() != 1) {
      // The first of two operands corresponds to the latch exit, via the
      // MiddleVPBB predecessor. Extract its last lane.
      ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
    }

    VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
    auto IsVector = [](ElementCount VF) { return VF.isVector(); };
    // When the VFs are vectors, we need to add an `extract` to get the
    // incoming value from the early exit. When the range contains a scalar
    // VF, limit the range to the scalar VF to prevent mis-compiling a range
    // that contains both scalar and vector VFs.
    if (!IncomingFromEarlyExit->isLiveIn() &&
        LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) {
      // Update the incoming value from the early exit.
      VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
          VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
          "first.active.lane");
      IncomingFromEarlyExit = EarlyExitB.createNaryOp(
          Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
          nullptr, "early.exit.value");
      ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
    }
  }
  MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});

  // Replace the condition controlling the non-early exit from the vector loop
  // with one exiting if either the original condition of the vector latch is
  // true or the early exit has been taken.
  auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
         "Unexpected terminator");
  auto *IsLatchExitTaken =
      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
                         LatchExitingBranch->getOperand(1));
  auto *AnyExitTaken = Builder.createNaryOp(
      Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
  Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
  LatchExitingBranch->eraseFromParent();
}

/// This function tries to convert extended in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
/// valid. The created recipe must be decomposed to its constituent
/// recipes before execution.
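/// A typical candidate is reduce.add(ext(A)); bundling the extend with the
/// reduction lets the cost model query the combined operation via
/// TTI::getExtendedReductionCost.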
static VPExpressionRecipe *
tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
                                     VFRange &Range) {
  using namespace VPlanPatternMatch;

  Type *RedTy = Ctx.Types.inferScalarType(Red);
  VPValue *VecOp = Red->getVecOp();

  // Clamp the range if using extended-reduction is profitable.
  auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
                                             Type *SrcTy) -> bool {
    return LoopVectorizationPlanner::getDecisionAndClampRange(
        [&](ElementCount VF) {
          auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
          InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
              Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
              CostKind);
          InstructionCost ExtCost =
              cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
          InstructionCost RedCost = Red->computeCost(VF, Ctx);
          return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
        },
        Range);
  };

  VPValue *A;
  // Match reduce(ext(A)).
  if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
      IsExtendedRedValidAndClampRange(
          RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
          cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
              Instruction::CastOps::ZExt,
          Ctx.Types.inferScalarType(A)))
    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);

  return nullptr;
}

/// This function tries to convert in-loop multiply-accumulate reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial
/// and valid. The created VPExpressionRecipe must be decomposed to its
/// constituent recipes before execution. Patterns of the
/// VPExpressionRecipe:
///   reduce.add(mul(...)),
///   reduce.add(mul(ext(A), ext(B))),
///   reduce.add(ext(mul(ext(A), ext(B)))).
static VPExpressionRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                          VPCostContext &Ctx, VFRange &Range) {
  using namespace VPlanPatternMatch;

  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
  if (Opcode != Instruction::Add)
    return nullptr;

  Type *RedTy = Ctx.Types.inferScalarType(Red);

  // Clamp the range if using multiply-accumulate-reduction is profitable.
  auto IsMulAccValidAndClampRange =
      [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
          VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
    return LoopVectorizationPlanner::getDecisionAndClampRange(
        [&](ElementCount VF) {
          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
          Type *SrcTy =
              Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
          auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
          InstructionCost MulAccCost =
              Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
          InstructionCost MulCost = Mul->computeCost(VF, Ctx);
          InstructionCost RedCost = Red->computeCost(VF, Ctx);
          InstructionCost ExtCost = 0;
          if (Ext0)
            ExtCost += Ext0->computeCost(VF, Ctx);
          if (Ext1)
            ExtCost += Ext1->computeCost(VF, Ctx);
          if (OuterExt)
            ExtCost += OuterExt->computeCost(VF, Ctx);

          return MulAccCost.isValid() &&
                 MulAccCost < ExtCost + MulCost + RedCost;
        },
        Range);
  };

  VPValue *VecOp = Red->getVecOp();
  VPValue *A, *B;
  // Try to match reduce.add(mul(...)).
  if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
    auto *RecipeA =
        dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
    auto *RecipeB =
        dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
    auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());

    // Match reduce.add(mul(ext, ext)).
    if (RecipeA && RecipeB &&
        (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
        match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
        match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
        IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
                                       Instruction::CastOps::ZExt,
                                   Mul, RecipeA, RecipeB, nullptr)) {
      return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
    }
    // Match reduce.add(mul).
    if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
      return new VPExpressionRecipe(Mul, Red);
  }
  // Match reduce.add(ext(mul(ext(A), ext(B)))).
  // All extend recipes must have the same opcode, or A == B, which can be
  // transformed to reduce.add(zext(mul(sext(A), sext(B)))).
  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
                                      m_ZExtOrSExt(m_VPValue()))))) {
    auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
    auto *Ext0 =
        cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
    auto *Ext1 =
        cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
    if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
        Ext0->getOpcode() == Ext1->getOpcode() &&
        IsMulAccValidAndClampRange(Ext0->getOpcode() ==
                                       Instruction::CastOps::ZExt,
                                   Mul, Ext0, Ext1, Ext)) {
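      // Clone the inner extends so they extend directly to the result type of
      // the outer extend; the multiply then feeds the reduction and the outer
      // extend becomes dead.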
      auto *NewExt0 = new VPWidenCastRecipe(
          Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
          Ext0->getDebugLoc());
      NewExt0->insertBefore(Ext0);

      VPWidenCastRecipe *NewExt1 = NewExt0;
      if (Ext0 != Ext1) {
        NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
                                        Ext->getResultType(), *Ext1,
                                        Ext1->getDebugLoc());
        NewExt1->insertBefore(Ext1);
      }
      Mul->setOperand(0, NewExt0);
      Mul->setOperand(1, NewExt1);
      Red->setOperand(1, Mul);
      return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
    }
  }
  return nullptr;
}

/// This function tries to create abstract recipes from the reduction recipe
/// for the following optimizations and cost estimation.
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
                                               VPCostContext &Ctx,
                                               VFRange &Range) {
  VPExpressionRecipe *AbstractR = nullptr;
  auto IP = std::next(Red->getIterator());
  auto *VPBB = Red->getParent();
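  // Prefer the multiply-accumulate pattern; fall back to a plain extended
  // reduction if it does not match.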
  if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
    AbstractR = MulAcc;
  else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
    AbstractR = ExtRed;
  // Cannot create abstract in-loop reduction recipes.
  if (!AbstractR)
    return;

  AbstractR->insertBefore(*VPBB, IP);
  Red->replaceAllUsesWith(AbstractR);
}

void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                               VFRange &Range) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
        tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
    }
  }
}

void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
  if (Plan.hasScalarVFOnly())
    return;

#ifndef NDEBUG
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);
#endif

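  // Collect candidate values to broadcast: the backedge-taken count (if it has
  // users), all live-ins and the values defined in the entry block.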
  SmallVector<VPValue *> VPValues;
  if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0)
    VPValues.push_back(Plan.getOrCreateBackedgeTakenCount());
  append_range(VPValues, Plan.getLiveIns());
  for (VPRecipeBase &R : *Plan.getEntry())
    append_range(VPValues, R.definedValues());

  auto *VectorPreheader = Plan.getVectorPreheader();
  for (VPValue *VPV : VPValues) {
    if (all_of(VPV->users(),
               [VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
        (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
         isa<Constant>(VPV->getLiveInIRValue())))
      continue;

    // Add explicit broadcast at the insert point that dominates all users.
    VPBasicBlock *HoistBlock = VectorPreheader;
    VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
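    // If any non-scalar user is itself in the vector preheader, broadcast at
    // the start of the preheader so the broadcast dominates it; otherwise
    // append the broadcast at the end of the preheader.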
    for (VPUser *User : VPV->users()) {
      if (User->usesScalars(VPV))
        continue;
      if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
        HoistPoint = HoistBlock->begin();
      else
        assert(VPDT.dominates(VectorPreheader,
                              cast<VPRecipeBase>(User)->getParent()) &&
               "All users must be in the vector preheader or dominated by it");
    }

    VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
    auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
    VPV->replaceUsesWithIf(Broadcast,
                           [VPV, Broadcast](VPUser &U, unsigned Idx) {
                             return Broadcast != &U && !U.usesScalars(VPV);
                           });
  }
}

/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
/// can be converted to a narrower recipe. \p OpV is used by a wide recipe that
/// feeds a store interleave group at index \p Idx, and \p WideMember0 is the
/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe
/// can be narrowed to an index-independent load if it feeds all wide ops at
/// all indices (i.e., \p OpV must also be the operand at index \p OpIdx of
/// \p WideMember0, the recipe at index 0). A VPInterleaveRecipe can be
/// narrowed to a wide load if \p OpV is defined at \p Idx of a load interleave
/// group.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
                          VPValue *OpV, unsigned Idx) {
  auto *DefR = OpV->getDefiningRecipe();
  if (!DefR)
    return WideMember0->getOperand(OpIdx) == OpV;
  if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
    return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;

  if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
    return IR->getInterleaveGroup()->getFactor() ==
               IR->getInterleaveGroup()->getNumMembers() &&
           IR->getVPValue(Idx) == OpV;
  return false;
}

/// Returns true if \p InterleaveR is a full interleave group with factor and
/// number of members both equal to \p VF. The interleave group must also
/// access the full vector width \p VectorRegWidth.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
                                         unsigned VF, VPTypeAnalysis &TypeInfo,
                                         unsigned VectorRegWidth) {
  if (!InterleaveR)
    return false;

  Type *GroupElementTy = nullptr;
  if (InterleaveR->getStoredValues().empty()) {
    GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
    if (!all_of(InterleaveR->definedValues(),
                [&TypeInfo, GroupElementTy](VPValue *Op) {
                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
                }))
      return false;
  } else {
    GroupElementTy =
        TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
    if (!all_of(InterleaveR->getStoredValues(),
                [&TypeInfo, GroupElementTy](VPValue *Op) {
                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
                }))
      return false;
  }

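  // The group accesses the full vector register iff VF elements of the common
  // element type exactly fill VectorRegWidth bits.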
  unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
  auto IG = InterleaveR->getInterleaveGroup();
  return IG->getFactor() == VF && IG->getNumMembers() == VF &&
         GroupSize == VectorRegWidth;
}

/// Returns true if \p VPV is a narrow VPValue, i.e. a live-in or a
/// single-scalar replicate recipe.
static bool isAlreadyNarrow(VPValue *VPV) {
  if (VPV->isLiveIn())
    return true;
  auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
  return RepR && RepR->isSingleScalar();
}

void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                             unsigned VectorRegWidth) {
  using namespace llvm::VPlanPatternMatch;
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
  if (VF.isScalable() || !VectorLoop)
    return;

  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  Type *CanonicalIVType = CanonicalIV->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);

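  // Look for load and store interleave groups that together cover a full
  // vector register; if all memory accesses qualify, the plan is rewritten so
  // each vector iteration executes a single original iteration per part,
  // using narrow (single-scalar or consecutive wide) memory operations
  // instead of interleave groups.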
  unsigned FixedVF = VF.getFixedValue();
  SmallVector<VPInterleaveRecipe *> StoreGroups;
  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
    if (isa<VPCanonicalIVPHIRecipe>(&R) ||
        match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
      continue;

    if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
        vputils::onlyFirstLaneUsed(cast<VPSingleDefRecipe>(&R)))
      continue;

    // Bail out on recipes not supported at the moment:
    // * phi recipes other than the canonical induction
    // * recipes writing to memory except interleave groups
    // Only support plans with a canonical induction phi.
    if (R.isPhi())
      return;

    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
    if (R.mayWriteToMemory() && !InterleaveR)
      return;

    // Do not narrow interleave groups if there are VectorPointer recipes and
    // the plan was unrolled. The recipe implicitly uses VF from
    // VPTransformState.
    // TODO: Remove restriction once the VF for the VectorPointer offset is
    // modeled explicitly as operand.
    if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
      return;

    // All other ops are allowed, but we reject uses that cannot be converted
    // when checking all allowed consumers (store interleave groups) below.
    if (!InterleaveR)
      continue;

    // Bail out on non-consecutive interleave groups.
    if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
                                      VectorRegWidth))
      return;

    // Skip read interleave groups.
    if (InterleaveR->getStoredValues().empty())
      continue;

    // Narrow interleave groups, if all operands are already matching narrow
    // ops.
    auto *Member0 = InterleaveR->getStoredValues()[0];
    if (isAlreadyNarrow(Member0) &&
        all_of(InterleaveR->getStoredValues(),
               [Member0](VPValue *VPV) { return Member0 == VPV; })) {
      StoreGroups.push_back(InterleaveR);
      continue;
    }

    // For now, we only support full interleave groups storing load interleave
    // groups.
    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
          if (!DefR)
            return false;
          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
          return IR &&
                 IR->getInterleaveGroup()->getFactor() ==
                     IR->getInterleaveGroup()->getNumMembers() &&
                 IR->getVPValue(Op.index()) == Op.value();
        })) {
      StoreGroups.push_back(InterleaveR);
      continue;
    }

    // Check if all values feeding InterleaveR are matching wide recipes whose
    // operands can be narrowed.
    auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
        InterleaveR->getStoredValues()[0]->getDefiningRecipe());
    if (!WideMember0)
      return;
    for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
      auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
      if (!R || R->getOpcode() != WideMember0->getOpcode() ||
          R->getNumOperands() > 2)
        return;
      if (any_of(enumerate(R->operands()),
                 [WideMember0, Idx = I](const auto &P) {
                   const auto &[OpIdx, OpV] = P;
                   return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
                 }))
        return;
    }
    StoreGroups.push_back(InterleaveR);
  }

  if (StoreGroups.empty())
    return;

  // Convert a load interleave group feeding a store group to a single wide
  // load, and a wide load to a single-scalar load, as the transformed VPlan
  // will only process one original iteration.
  SmallPtrSet<VPValue *, 4> NarrowedOps;
  auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
    auto *R = V->getDefiningRecipe();
    if (!R || NarrowedOps.contains(V))
      return V;
    if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
      // Narrow interleave group to wide load, as transformed VPlan will only
      // process one original iteration.
      auto *L = new VPWidenLoadRecipe(
          *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
          LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
          /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
      L->insertBefore(LoadGroup);
      NarrowedOps.insert(L);
      return L;
    }

    if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
      assert(RepR->isSingleScalar() &&
             isa<LoadInst>(RepR->getUnderlyingInstr()) &&
             "must be a single scalar load");
      NarrowedOps.insert(RepR);
      return RepR;
    }
    auto *WideLoad = cast<VPWidenLoadRecipe>(R);

    // Narrow wide load to uniform scalar load, as transformed VPlan will only
    // process one original iteration.
    auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
                                    WideLoad->operands(), /*IsUniform*/ true,
                                    /*Mask*/ nullptr, *WideLoad);
    N->insertBefore(WideLoad);
    NarrowedOps.insert(N);
    return N;
  };

  // Narrow operation tree rooted at store groups.
  for (auto *StoreGroup : StoreGroups) {
    VPValue *Res = nullptr;
    VPValue *Member0 = StoreGroup->getStoredValues()[0];
    if (isAlreadyNarrow(Member0)) {
      Res = Member0;
    } else if (auto *WideMember0 =
                   dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) {
      for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
        WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
      Res = WideMember0;
    } else {
      Res = NarrowOp(Member0);
    }

    auto *S = new VPWidenStoreRecipe(
        *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
        StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
    S->insertBefore(StoreGroup);
    StoreGroup->eraseFromParent();
  }

  // Adjust induction to reflect that the transformed plan only processes one
  // original iteration.
  auto *CanIV = Plan.getCanonicalIV();
  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
  Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
                         CanIV->getScalarType(), 1 * Plan.getUF())));
  Plan.getVF().replaceAllUsesWith(
      Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
  removeDeadRecipes(Plan);
}

/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
/// BranchOnCond recipe.
void VPlanTransforms::addBranchWeightToMiddleTerminator(
    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  auto *MiddleTerm =
      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
  // Only add branch metadata if there is a (conditional) terminator.
  if (!MiddleTerm)
    return;

  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
         "must have a BranchOnCond");
  // Assume that `TripCount % VectorStep` is uniformly distributed.
  unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
  if (VF.isScalable() && VScaleForTuning.has_value())
    VectorStep *= *VScaleForTuning;
  assert(VectorStep > 0 && "trip count should not be zero");
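  // Under that assumption the remainder is zero, i.e. the vector loop covers
  // the whole trip count, in only 1 out of VectorStep cases, which the
  // {1, VectorStep - 1} weights below encode.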
  MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
  MDNode *BranchWeights =
      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
  MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
}