//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
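//
// For example, with a vectorization factor of 4, a scalar loop such as
//   for (i = 0; i < n; i += 1) A[i] = B[i] + 1;
// is conceptually rewritten so that each iteration processes four elements:
//   for (i = 0; i + 3 < n; i += 4) A[i..i+3] = B[i..i+3] + <1,1,1,1>;
// with any remaining iterations handled by a scalar epilogue loop.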
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
237
238 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
239 "force-tail-folding-style", cl::desc("Force the tail folding style"),
240 cl::init(TailFoldingStyle::None),
241 cl::values(
242 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
243 clEnumValN(
244 TailFoldingStyle::Data, "data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fallback to data-without-lane-mask.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in "
             "a loop"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

cl::opt<unsigned> llvm::ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> llvm::EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

cl::opt<bool>
    llvm::VerifyEachVPlan("vplan-verify-each",
#ifdef EXPENSIVE_CHECKS
                          cl::init(true),
#else
                          cl::init(false),
#endif
                          cl::Hidden,
                          cl::desc("Verify VPlans after VPlan transforms."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

static cl::opt<bool> EnableEarlyExitVectorization(
    "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable vectorization of early exit loops with uncountable exits."));

// Likelihood of bypassing the vectorized loop because there are zero trips
// left after the prologue. See `emitIterationCountCheck`.
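// With weights {1, 127}, the bypass edge is assumed to be taken roughly once
// in every 128 executions of the check.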
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
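  // For example, i1 (1 bit of data, 1 byte of storage) and x86_fp80 (80 bits
  // of data, but a larger allocation size) are both irregular.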
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
/// ElementCount to include loops whose trip count is a function of vscale.
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
                                              const Loop *L) {
  return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
}

/// Returns "best known" trip count, which is either a valid positive trip
/// count or std::nullopt when an estimate cannot be made (including when the
/// trip count would overflow), for the specified loop \p L as defined by the
/// following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<ElementCount>
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
                    bool CanUseConstantMax = true) {
  // Check if exact trip count is known.
  if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return ElementCount::getFixed(*EstimatedTC);

  if (!CanUseConstantMax)
    return std::nullopt;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
    return ElementCount::getFixed(ExpectedTC);

  return std::nullopt;
}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationCostModel *CM,
                      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
                      GeneratedRTChecks &RTChecks, VPlan &Plan)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth),
        MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI),
        RTChecks(RTChecks), Plan(Plan),
        VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop. In the case of epilogue vectorization, this function is overridden
  /// to handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Fix the vectorized code, taking care of header phi's, and more.
  void fixVectorizedLoop(VPTransformState &State);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

  /// Return the additional bypass block which targets the scalar loop by
  /// skipping the epilogue loop after completing the main loop.
  BasicBlock *getAdditionalBypassBlock() const {
    assert(AdditionalBypassBlock &&
           "Trying to access AdditionalBypassBlock but it has not been set");
    return AdditionalBypassBlock;
  }

protected:
  friend class LoopVectorizationPlanner;

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Create a check to see if the vector loop should be executed.
  Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
  /// vector preheader and its predecessor, also connecting the new block to the
  /// scalar preheader.
  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader = nullptr;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader = nullptr;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock = nullptr;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
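  /// For example, TripCount = 1003 with VF = 4 and UF = 2 gives a vector trip
  /// count of 1000; the remaining 3 iterations run in the scalar loop.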
  Value *VectorTripCount = nullptr;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  /// The additional bypass block which conditionally skips over the epilogue
  /// loop after executing the main loop. Needed to resume inductions and
  /// reductions during epilogue vectorization.
  BasicBlock *AdditionalBypassBlock = nullptr;

  VPlan &Plan;

  /// The vector preheader block of \p Plan, used as target for check blocks
  /// introduced during skeleton creation.
  VPBlockBase *VectorPHVPB;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;
  VPlan &EpiloguePlan;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF,
                                VPlan &EpiloguePlan)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
        EpiloguePlan(EpiloguePlan) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
      ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM,
                            BFI, PSI, Checks, Plan),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
      ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, CM, BFI, PSI, Check, Plan) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
      ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, CM, BFI, PSI, Checks, Plan) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its operands.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return DebugLoc::getUnknown();

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I->getDebugLoc();

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst->getDebugLoc();
  }

  return I->getDebugLoc();
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. If \p DL is passed, use it as debug location for
/// the remark. \return the remark object that can be streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I, DebugLoc DL = {}) {
  BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
  // If debug location is attached to the instruction, use it. Otherwise if DL
  // was not provided, use the loop's.
  if (I && I->getDebugLoc())
    DL = I->getDebugLoc();
  else if (!DL)
    DL = TheLoop->getStartLoc();

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
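/// For a fixed VF this folds to a constant (e.g. VF = 4 and Step = 2 yield the
/// constant 8); for a scalable VF it produces the equivalent runtime multiple
/// of vscale (e.g. vscale * 8).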
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
                             I, DL)
            << Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}

} // end namespace llvm

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
  friend class LoopVectorizationPlanner;

public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI,
                             ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {
    if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
      initializeVScaleForTuning();
    CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                             PGSOQueryType::IRPass);
  }

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Set up cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectNonVectorizedAndSetWideningDecisions(UserVF);
    return expectedCost(UserVF).isValid();
  }

  /// \return True if maximizing vector bandwidth is enabled by the target or
  /// user options, for the given register kind.
  bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);

  /// \return True if maximizing vector bandwidth is enabled by the target or
  /// user options, for the given vector factor.
  bool useMaxBandwidth(ElementCount VF);

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                 InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
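  /// Ordered reductions preserve the source evaluation order of the FP
  /// operations (e.g. a sequential chain of fadds) instead of using a
  /// reassociated tree reduction.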
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.contains(I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize,
    CM_VectorCall,
    CM_IntrinsicCall
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[{I, VF}] = {W, Cost};
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// When interleaving, the cost will only be assigned to one instruction,
    /// the insert position. For other cases, add the appropriate fraction of
    /// the total cost to each instruction. This ensures accurate costs are
    /// used, even if the insert position instruction is not used.
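    /// For example, if a 4-member group with a total cost of 8 is not
    /// vectorized as a single interleaved access, each member is charged a
    /// cost of 2 rather than charging all 8 to the insert position.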
    InstructionCost InsertPosCost = Cost;
    InstructionCost OtherMemberCost = 0;
    if (W != CM_Interleave)
      OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
    for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
      if (auto *I = Grp->getMember(Idx)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[{I, VF}] = {W, InsertPosCost};
        else
          WideningDecisions[{I, VF}] = {W, OtherMemberCost};
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  struct CallWideningDecision {
    InstWidening Kind;
    Function *Variant;
    Intrinsic::ID IID;
    std::optional<unsigned> MaskPos;
    InstructionCost Cost;
  };

  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
  }

  CallWideningDecision getCallWideningDecision(CallInst *CI,
                                               ElementCount VF) const {
    assert(!VF.isScalar() && "Expected vector VF");
    return CallWideningDecisions.at({CI, VF});
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
    Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect values that will not be widened, including Uniforms, Scalars, and
  /// Instructions to Scalarize for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.contains(VF))
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    setVectorizedCallDecision(VF);
    collectLoopScalars(VF);
    collectInstsToScalarize(VF);
  }
1205
1206 /// Returns true if the target machine supports masked store operation
1207 /// for the given \p DataType and kind of access to \p Ptr.
isLegalMaskedStore(Type * DataType,Value * Ptr,Align Alignment,unsigned AddressSpace) const1208 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1209 unsigned AddressSpace) const {
1210 return Legal->isConsecutivePtr(DataType, Ptr) &&
1211 TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
1212 }
1213
1214 /// Returns true if the target machine supports masked load operation
1215 /// for the given \p DataType and kind of access to \p Ptr.
isLegalMaskedLoad(Type * DataType,Value * Ptr,Align Alignment,unsigned AddressSpace) const1216 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1217 unsigned AddressSpace) const {
1218 return Legal->isConsecutivePtr(DataType, Ptr) &&
1219 TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1220 }
1221
1222 /// Returns true if the target machine can represent \p V as a masked gather
1223 /// or scatter operation.
isLegalGatherOrScatter(Value * V,ElementCount VF)1224 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1225 bool LI = isa<LoadInst>(V);
1226 bool SI = isa<StoreInst>(V);
1227 if (!LI && !SI)
1228 return false;
1229 auto *Ty = getLoadStoreType(V);
1230 Align Align = getLoadStoreAlignment(V);
1231 if (VF.isVector())
1232 Ty = VectorType::get(Ty, VF);
1233 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1234 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1235 }
1236
1237 /// Returns true if the target machine supports all of the reduction
1238 /// variables found for the given VF.
canVectorizeReductions(ElementCount VF) const1239 bool canVectorizeReductions(ElementCount VF) const {
1240 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1241 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1242 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1243 }));
1244 }
1245
1246 /// Given costs for both strategies, return true if the scalar predication
1247 /// lowering should be used for div/rem. This incorporates an override
1248 /// option so it is not simply a cost comparison.
isDivRemScalarWithPredication(InstructionCost ScalarCost,InstructionCost SafeDivisorCost) const1249 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1250 InstructionCost SafeDivisorCost) const {
1251 switch (ForceSafeDivisor) {
1252 case cl::BOU_UNSET:
1253 return ScalarCost < SafeDivisorCost;
1254 case cl::BOU_TRUE:
1255 return false;
1256 case cl::BOU_FALSE:
1257 return true;
1258 }
1259 llvm_unreachable("impossible case value");
1260 }
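
  // Illustrative behavior (hypothetical costs, not from the source): with the
  // ForceSafeDivisor override unset, the decision is purely cost based:
  //
  //   isDivRemScalarWithPredication(/*ScalarCost=*/10, /*SafeDivisorCost=*/8)
  //     == false  // 10 < 8 fails, so the safe-divisor lowering is chosen
  //   isDivRemScalarWithPredication(/*ScalarCost=*/5, /*SafeDivisorCost=*/8)
  //     == true   // scalar predication is cheaper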

  /// Returns true if \p I is an instruction which requires predication and
  /// for which our chosen predication strategy is scalarization (i.e. we
  /// don't have an alternate strategy such as masking available).
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  bool isScalarWithPredication(Instruction *I, ElementCount VF) const;

  /// Returns true if \p I is an instruction that needs to be predicated
  /// at runtime. The result is independent of the predication mechanism.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) const;

  /// Return the costs for our two available strategies for lowering a
  /// div/rem operation which requires speculating at least one lane.
  /// First result is for scalarization (will be invalid for scalable
  /// vectors); second is for the safe-divisor strategy.
  std::pair<InstructionCost, InstructionCost>
  getDivRemSpeculationCost(Instruction *I, ElementCount VF) const;

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) const {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) const {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(bool IsVectorizing) const {
    if (!isScalarEpilogueAllowed()) {
      LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
      return false;
    }
    // If we might exit from anywhere but the latch and early exit
    // vectorization is disabled, we must run the exiting iteration in scalar
    // form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
        !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
                           "from latch block\n");
      return true;
    }
    if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
                           "interleaved group requires scalar epilogue\n");
      return true;
    }
    LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
    return false;
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns the TailFoldingStyle that is best for the current loop.
  TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
    if (!ChosenTailFoldingStyle)
      return TailFoldingStyle::None;
    return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
                               : ChosenTailFoldingStyle->second;
  }

  /// Selects and saves the TailFoldingStyle for two cases: whether the IV
  /// update may overflow or not.
  /// \param IsScalableVF true if scalable vector factors enabled.
  /// \param UserIC User specific interleave count.
  void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
    assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
    if (!Legal->canFoldTailByMasking()) {
      ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
      return;
    }

    // Default to TTI preference, but allow command line override.
    ChosenTailFoldingStyle = {
        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
    if (ForceTailFoldingStyle.getNumOccurrences())
      ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
                                ForceTailFoldingStyle.getValue()};

    if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
      return;
    // Override forced styles if needed.
    // FIXME: Investigate opportunity for fixed vector factor.
    bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
                      TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
    if (EVLIsLegal)
      return;
    // If for some reason EVL mode is unsupported, fall back to
    // DataWithoutLaneMask to try to vectorize the loop with folded tail
    // in a generic way.
    ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
                              TailFoldingStyle::DataWithoutLaneMask};
    LLVM_DEBUG(
        dbgs() << "LV: Preference for VP intrinsics indicated. Will "
                  "not try to generate VP Intrinsics "
               << (UserIC > 1
                       ? "since interleave count specified is greater than 1.\n"
                       : "due to non-interleaving reasons.\n"));
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const {
    // TODO: check if it is possible to check for None style independent of
    // IVUpdateMayOverflow flag in getTailFoldingStyle.
    return getTailFoldingStyle() != TailFoldingStyle::None;
  }

  /// Return maximum safe number of elements to be processed per vector
  /// iteration, which do not prevent store-load forwarding and are safe with
  /// regard to the memory dependencies. Required for EVL-based VPlans to
  /// correctly calculate AVL (application vector length) as min(remaining AVL,
  /// MaxSafeElements).
  /// TODO: need to consider adjusting cost model to use this value as a
  /// vectorization factor for EVL-based vectorization.
  std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
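
  // Worked example (assumed values, not from the source): if MaxSafeElements
  // is 16 and 21 elements remain, an EVL-based plan clamps the next
  // application vector length to min(21, 16) = 16, leaving 5 elements for the
  // following vector iteration.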

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Returns true if VP intrinsics with explicit vector length support should
  /// be generated in the tail folded loop.
  bool foldTailWithEVL() const {
    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductions.contains(Phi);
  }

  /// Returns true if the predicated reduction select should be used to set the
  /// incoming value for the reduction phi.
  bool usePredicatedReductionSelect() const {
    // Force to use predicated reduction select since the EVL of the
    // second-to-last iteration might not be VF*UF.
    if (foldTailWithEVL())
      return true;
    return PreferPredicatedReductionSelect ||
           TTI.preferPredicatedReductionSelect();
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    CallWideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  InstructionCost expectedCost(ElementCount VF);

  bool hasPredStores() const { return NumPredStores > 0; }

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  /// \p IC is the interleave count, an additional scaling factor applied to
  /// VF before comparing to EpilogueVectorizationMinVF.
  bool isEpilogueVectorizationProfitable(const ElementCount VF,
                                         const unsigned IC) const;

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  std::optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF,
                          Type *VectorTy) const;

  /// Returns true if \p Op should be considered invariant and if it is
  /// trivially hoistable.
  bool shouldConsiderInvariant(Value *Op);

  /// Return the value of vscale used for tuning the cost model.
  std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }

private:
  unsigned NumPredStores = 0;

  /// Used to store the value of vscale used for tuning the cost model. It is
  /// initialized during object construction.
  std::optional<unsigned> VScaleForTuning;

  /// Initializes the value of vscale used for tuning the cost model. If
  /// vscale_range.min == vscale_range.max then return vscale_range.max, else
  /// return the value returned by the corresponding TTI method.
  void initializeVScaleForTuning() {
    const Function *Fn = TheLoop->getHeader()->getParent();
    if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
      auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
      auto Min = Attr.getVScaleRangeMin();
      auto Max = Attr.getVScaleRangeMax();
      if (Max && Min == Max) {
        VScaleForTuning = Max;
        return;
      }
    }

    VScaleForTuning = TTI.getVScaleForTuning();
  }
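
  // Example (illustrative IR): for a function carrying the attribute
  //
  //   attributes #0 = { vscale_range(2,2) }
  //
  // min == max, so VScaleForTuning is set to 2 without consulting TTI.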

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF,
                                       bool FoldTailByMasking);

  /// Checks if scalable vectorization is supported and enabled. Caches the
  /// result to avoid repeated debug dumps for repeated queries.
  bool isScalableVectorizationAllowed();

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be
  /// legally represented as. The vector equivalents of these values should be
  /// truncated to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// Control finally chosen tail folding style. The first element is used if
  /// the IV update may overflow, the second if it does not.
  std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
      ChosenTailFoldingStyle;

  /// true if scalable vectorization is supported and enabled.
  std::optional<bool> IsScalableVectorizationAllowed;

  /// Maximum safe number of elements to be processed per vector iteration,
  /// which do not prevent store-load forwarding and are safe with regard to
  /// the memory dependencies. Required for EVL-based vectorization, where
  /// this value is used as the upper bound of the safe AVL.
  std::optional<unsigned> MaxSafeElements;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop.
  SmallPtrSet<PHINode *, 4> InLoopReductions;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  InstructionCost computePredInstDiscount(Instruction *PredInst,
                                          ScalarCostsTy &ScalarCosts,
                                          ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform
  /// nodes to the list if they are used by a load/store instruction that is
  /// marked as CM_Scalarize. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to
  /// an iteration of the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

  using CallDecisionList =
      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

  CallDecisionList CallWideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I) ||
        getWideningDecision(I, VF) == CM_Scalarize)
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
  };

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) const {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved
  /// accesses with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// The kind of cost that we are calculating.
  TTI::TargetCostKind CostKind;

  /// Whether this loop should be optimized for size based on a function
  /// attribute or profile information.
  bool OptForSize;
};
} // end namespace llvm

namespace {
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr no SCEV checks have been generated.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime
  /// checks. If it is nullptr no memory runtime checks have been generated.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;
  TargetTransformInfo *TTI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  bool CostTooHigh = false;

  Loop *OuterLoop = nullptr;

  PredicatedScalarEvolution &PSE;

  /// The kind of cost that we are calculating.
  TTI::TargetCostKind CostKind;

public:
  GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                    LoopInfo *LI, TargetTransformInfo *TTI,
                    const DataLayout &DL, TTI::TargetCostKind CostKind)
      : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
        MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
        CostKind(CostKind) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {

    // Hard cutoff to limit compile-time increase in case a very large number
    // of runtime checks needs to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    CostTooHigh =
        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
    if (CostTooHigh)
      return;

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT,
                                  LI, nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
      if (isa<Constant>(SCEVCheckCond)) {
        // Clean up directly after expanding the predicate to a constant, to
        // avoid further expansions re-using anything left over from SCEVExp.
        SCEVExpanderCleaner SCEVCleaner(SCEVExp);
        SCEVCleaner.cleanup();
      }
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
            [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(
            MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
            MemCheckExp, VectorizerParams::HoistRuntimeChecks);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(
          Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(
          Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }

    // Outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }
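
  // Illustrative shape of the IR right after create() (not from the source):
  // both check blocks exist but have no predecessors and end in unreachable,
  // so they are invisible to the rest of the pipeline until re-linked:
  //
  //   preheader:          ; branches straight to the loop header
  //   vector.scevcheck:   ; no predecessors, terminator: unreachable
  //   vector.memcheck:    ; no predecessors, terminator: unreachable
  //
  // getCost() below walks these detached blocks; if vectorization goes ahead
  // they are wired back in, otherwise the destructor erases them.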

  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    if (CostTooHigh) {
      InstructionCost Cost;
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(&I, CostKind);
        LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(&I, CostKind);
        LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop
      // we should find out if these checks are outer loop invariant. If so,
      // the checks will likely be hoisted out and so the effective cost will
      // reduce according to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing
        // every individual memory check, since there could be a mixture of
        // loop variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
        if (SE->isLoopInvariant(Cond, OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // Get the best known TC estimate.
          if (auto EstimatedTC = getSmallBestKnownTC(
                  PSE, OuterLoop, /* CanUseConstantMax = */ false))
            if (EstimatedTC->isFixed())
              BestTripCount = EstimatedTC->getFixedValue();

          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

          // Let's ensure the cost is always at least 1.
          NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
                                     (InstructionCost::CostType)1);

          if (BestTripCount > 1)
            LLVM_DEBUG(dbgs()
                       << "We expect runtime memory checks to be hoisted "
                       << "out of the outer loop. Cost reduced from "
                       << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions,
  /// if unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
    bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
    if (SCEVChecksUsed)
      SCEVCleaner.markResultUsed();

    if (MemChecksUsed) {
      MemCheckCleaner.markResultUsed();
    } else {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (!SCEVChecksUsed)
      SCEVCheckBlock->eraseFromParent();
    if (!MemChecksUsed)
      MemCheckBlock->eraseFromParent();
  }

  /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
  /// outside VPlan.
  std::pair<Value *, BasicBlock *> getSCEVChecks() {
    using namespace llvm::PatternMatch;
    if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
      return {nullptr, nullptr};

    return {SCEVCheckCond, SCEVCheckBlock};
  }

  /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
  /// outside VPlan.
  std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
    return {MemRuntimeCheckCond, MemCheckBlock};
  }

  /// Return true if any runtime checks have been added.
  bool hasChecks() const {
    using namespace llvm::PatternMatch;
    return (SCEVCheckCond && !match(SCEVCheckCond, m_ZeroInt())) ||
           MemRuntimeCheckCond;
  }
};
} // namespace

static bool useActiveLaneMask(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::Data ||
         Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}

static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}

// Return true if \p OuterLp is an outer loop annotated with hints for
// explicit vectorization. The loop needs to be annotated with #pragma omp
// simd simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
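
// Example annotation (illustrative) that makes an outer loop eligible here:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (unsigned i = 0; i < N; ++i)   // outer loop handed to the VPlan path
//     for (unsigned j = 0; j < M; ++j)
//       A[i][j] += B[i][j];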

static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If
  // we are stress testing the VPlan H-CFG construction, we collect the
  // outermost loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
static Value *
emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                     Value *Step,
                     InductionDescriptor::InductionKind InductionKind,
                     const BinaryOperator *InductionBinOp) {
  using namespace llvm::PatternMatch;
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(Index, StepTy)
                           : B.CreateCast(Instruction::SIToFP, Index, StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (match(X, m_ZeroInt()))
      return Y;
    if (match(Y, m_ZeroInt()))
      return X;
    return B.CreateAdd(X, Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (match(X, m_One()))
      return Y;
    if (match(Y, m_One()))
      return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *MulExp = B.CreateFMul(Step, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
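
// Illustrative results (assumed operands, not from the source): with
// Index = %i, StartValue = %start and Step = 4,
//
//   IK_IntInduction: %start + %i * 4  -> add(%start, mul(%i, 4))
//   IK_PtrInduction: &%start[%i * 4]  -> ptradd(%start, mul(%i, 4))
//   IK_FpInduction:  %start fadd/fsub ((sitofp %i) fmul 4.0),
//                    with the opcode taken from InductionBinOp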

static std::optional<unsigned> getMaxVScale(const Function &F,
                                            const TargetTransformInfo &TTI) {
  if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
    return MaxVScale;

  if (F.hasFnAttribute(Attribute::VScaleRange))
    return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();

  return std::nullopt;
}

/// For the given VF and UF and maximum trip count computed for the loop,
/// return whether the induction variable might overflow in the vectorized
/// loop. If not, then we know a runtime overflow check always evaluates to
/// false and can be removed.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost, ElementCount VF,
    std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = IdxTy->getMask();

  // The runtime overflow check is known false iff the (max) trip-count is
  // known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      std::optional<unsigned> MaxVScale =
          getMaxVScale(*Cost->TheFunction, Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
  }

  return false;
}
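
// Worked example (assumed values): for an i8 widest induction type,
// MaxUIntTripCount is 255. With a known max trip count TC = 200, VF = 4 and
// UF = 2, the test is (255 - 200) ugt (4 * 2), i.e. 55 > 8, so the overflow
// check is known false and can be omitted.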

// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}

Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getTripCount();
  IRBuilder<> Builder(InsertBlock->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(Builder, Ty, VF, UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero
  // given that it starts at zero and its Step is a power of two; the loop
  // will then exit, with the last early-exit vector comparison also producing
  // all-true. For scalable vectors the VF is not guaranteed to be a power of
  // 2, but this is accounted for in emitIterationCountCheck that adds an
  // overflow check.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
                           "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the
  // remainder loop. See the cost model for when this can happen. If the step
  // evenly divides the trip count, we set the remainder to be equal to the
  // step. If the step does not evenly divide the trip count, no adjustment is
  // necessary since there will already be scalar iterations. Note that the
  // minimum iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(VF.isVector())) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
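
// Worked example (assumed values): with TC = 17 and Step = VF * UF = 8,
// n.mod.vf = 17 urem 8 = 1 and n.vec = 17 - 1 = 16. If a scalar epilogue is
// required and TC = 16, the remainder 0 is bumped to Step, giving n.vec = 8
// so the final 8 iterations run in the scalar loop. With tail folding, TC is
// first rounded up to 17 + 7 = 24 and n.vec = 24.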

void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
  // Note: The block with the minimum trip-count check is already connected
  // during earlier VPlan construction.
  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
  VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
  assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
  assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
  VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
  VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
  PreVectorPH = CheckVPIRBB;
  VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
  PreVectorPH->swapSuccessors();

  // We just connected a new block to the scalar preheader. Update all
  // VPPhis by adding an incoming value for it, replicating the last value.
  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
  for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
           "must have incoming values for all operands");
    R.addOperand(R.getOperand(NumPredecessors - 2));
  }
}

Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
                                                      unsigned UF) const {
  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding
  // one to the backedge-taken count overflowed leading to an incorrect trip
  // count of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
                                                       : ICmpInst::ICMP_ULT;

  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *Count = getTripCount();
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProfitableTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(Builder, CountTy, VF, UF);

    Value *MinProfTC =
        createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
    if (!VF.isScalable())
      return MinProfTC;
    return Builder.CreateBinaryIntrinsic(
        Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None) {
    Value *Step = CreateStep();
    ScalarEvolution &SE = *PSE.getSE();
    // TODO: Emit unconditional branch to vector preheader instead of
    // conditional branch with known condition.
    const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
    // Check if the trip count is < the step.
    if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
      // TODO: Ensure step is at most the trip count when determining max VF
      // and UF, w/o tail folding.
      CheckMinIters = Builder.getTrue();
    } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
                                    TripCountSCEV, SE.getSCEV(Step))) {
      // Generate the minimum iteration check only if we cannot prove the
      // check is known to be true, or known to be false.
      CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
    } // else step known to be < trip count, use CheckMinIters preset to false.
  } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
             !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
             Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
    // vscale is not necessarily a power-of-2, which means we cannot guarantee
    // an overflow to zero when updating induction variables and so an
    // additional overflow check is required before entering the vector loop.

    // Get the maximum unsigned value for the type.
    Value *MaxUIntTripCount =
        ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
    Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);

    // Don't execute the vector loop if (UMax - n) < (VF * UF).
    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
  }
  return CheckMinIters;
}
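
// Illustrative IR (assumed names, not from the source) when neither side of
// the check can be proven: for VF = 4, UF = 2 and no required scalar
// epilogue, the emitted compare is
//
//   %min.iters.check = icmp ult i64 %n, 8
//
// With a required scalar epilogue the predicate becomes ule, so the vector
// loop is also bypassed when %n == 8.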
2389
emitIterationCountCheck(BasicBlock * Bypass)2390 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2391 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2392 Value *CheckMinIters = createIterationCountCheck(VF, UF);
2393 // Create new preheader for vector loop.
2394 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
2395 static_cast<DominatorTree *>(nullptr), LI,
2396 nullptr, "vector.ph");
2397
2398 BranchInst &BI =
2399 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2400 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2401 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2402 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2403
2404 assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
2405 TCCheckBlock &&
2406 "Plan's entry must be TCCCheckBlock");
2407 }
2408
2409 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2410 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2411 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2412 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
replaceVPBBWithIRVPBB(VPBasicBlock * VPBB,BasicBlock * IRBB)2413 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
2414 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2415 for (auto &R : make_early_inc_range(*VPBB)) {
2416 assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) &&
2417 "Tried to move phi recipe after a non-phi recipe");
2418 R.moveBefore(*IRVPBB, IRVPBB->end());
2419 }
2420
2421 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2422 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2423 }
2424
createVectorLoopSkeleton(StringRef Prefix)2425 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2426 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2427 assert(LoopVectorPreHeader && "Invalid loop structure");
2428 assert((OrigLoop->getUniqueLatchExitBlock() ||
2429 Cost->requiresScalarEpilogue(VF.isVector())) &&
2430 "loops not exiting via the latch without required epilogue?");
2431
2432 LoopScalarPreHeader =
2433 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2434 LI, nullptr, Twine(Prefix) + "scalar.ph");
2435 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2436 // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
2437 // preheader may be unreachable at this point. Instead it is replaced in
2438 // createVectorizedLoopSkeleton.
2439 }
2440
2441 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2442 /// expansion results.
getExpandedStep(const InductionDescriptor & ID,const SCEV2ValueTy & ExpandedSCEVs)2443 static Value *getExpandedStep(const InductionDescriptor &ID,
2444 const SCEV2ValueTy &ExpandedSCEVs) {
2445 const SCEV *Step = ID.getStep();
2446 if (auto *C = dyn_cast<SCEVConstant>(Step))
2447 return C->getValue();
2448 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2449 return U->getValue();
2450 Value *V = ExpandedSCEVs.lookup(Step);
2451 assert(V && "SCEV must be expanded at this point");
2452 return V;
2453 }
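
// In short: a constant step such as 'i += 4' comes back directly as a
// ConstantInt, a step that is already an IR value comes back via SCEVUnknown,
// and any other loop-invariant step must have been materialized by the SCEV
// expander and is looked up in ExpandedSCEVs.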
2454
2455 /// Knowing that loop \p L executes a single vector iteration, add to \p
2456 /// InstsToIgnore the instructions that will get simplified away and thus
2457 /// should not have any cost.
2458 static void addFullyUnrolledInstructionsToIgnore(
2459 Loop *L, const LoopVectorizationLegality::InductionList &IL,
2460 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2461 auto *Cmp = L->getLatchCmpInst();
2462 if (Cmp)
2463 InstsToIgnore.insert(Cmp);
2464 for (const auto &KV : IL) {
2465 // Extract the key by hand so that it can be used in the lambda below. Note
2466 // that captured structured bindings are a C++20 extension.
2467 const PHINode *IV = KV.first;
2468
2469 // Get next iteration value of the induction variable.
2470 Instruction *IVInst =
2471 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2472 if (all_of(IVInst->users(),
2473 [&](const User *U) { return U == IV || U == Cmp; }))
2474 InstsToIgnore.insert(IVInst);
2475 }
2476 }
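
// For example (a sketch): in a loop whose latch computes
//   %iv.next = add nuw i64 %iv, 1
//   %cmp = icmp eq i64 %iv.next, %n
// both %cmp and %iv.next are added to InstsToIgnore: once the loop is known
// to execute a single vector iteration, they fold away and carry no cost.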
2477
2478 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2479 /*
2480 In this function we generate a new loop. The new loop will contain
2481 the vectorized instructions while the old loop will continue to run the
2482 scalar remainder.
2483
2484 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2485 / | preheader are expanded here. Eventually all required SCEV
2486 / | expansion should happen here.
2487 / v
2488 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2489 | / |
2490 | / v
2491 || [ ] <-- vector pre header.
2492 |/ |
2493 | v
2494 | [ ] \
2495 | [ ]_| <-- vector loop (created during VPlan execution).
2496 | |
2497 | v
2498 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2499 | | successors created during VPlan execution)
2500 \/ |
2501 /\ v
2502 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2503 | |
2504 (opt) v <-- edge from middle to exit iff epilogue is not required.
2505 | [ ] \
2506 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
2507 | | wrapped in VPIRBasicBlock).
2508 \ |
2509 \ v
2510 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2511 ...
2512 */
2513
2514 // Create an empty vector loop, and prepare basic blocks for the runtime
2515 // checks.
2516 createVectorLoopSkeleton("");
2517
2518 // Now, compare the new count to zero. If it is zero skip the vector loop and
2519 // jump to the scalar loop. This check also covers the case where the
2520 // backedge-taken count is uint##_max: adding one to it will overflow leading
2521 // to an incorrect trip count of zero. In this (rare) case we will also jump
2522 // to the scalar loop.
2523 emitIterationCountCheck(LoopScalarPreHeader);
2524
2525 replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
2526 return LoopVectorPreHeader;
2527 }
2528
2529 namespace {
2530
2531 struct CSEDenseMapInfo {
2532   static bool canHandle(const Instruction *I) {
2533 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2534 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2535 }
2536
2537   static inline Instruction *getEmptyKey() {
2538 return DenseMapInfo<Instruction *>::getEmptyKey();
2539 }
2540
2541   static inline Instruction *getTombstoneKey() {
2542 return DenseMapInfo<Instruction *>::getTombstoneKey();
2543 }
2544
2545   static unsigned getHashValue(const Instruction *I) {
2546 assert(canHandle(I) && "Unknown instruction!");
2547 return hash_combine(I->getOpcode(),
2548 hash_combine_range(I->operand_values()));
2549 }
2550
2551   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2552 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2553 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2554 return LHS == RHS;
2555 return LHS->isIdenticalTo(RHS);
2556 }
2557 };
2558
2559 } // end anonymous namespace
2560
2561 /// Perform CSE of induction variable instructions.
2562 static void cse(BasicBlock *BB) {
2563 // Perform simple cse.
2564 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2565 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2566 if (!CSEDenseMapInfo::canHandle(&In))
2567 continue;
2568
2569 // Check if we can replace this instruction with any of the
2570 // visited instructions.
2571 if (Instruction *V = CSEMap.lookup(&In)) {
2572 In.replaceAllUsesWith(V);
2573 In.eraseFromParent();
2574 continue;
2575 }
2576
2577 CSEMap[&In] = &In;
2578 }
2579 }
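
// For example (a sketch): if the block contains two identical GEPs,
//   %gep1 = getelementptr inbounds i32, ptr %a, i64 %idx
//   %gep2 = getelementptr inbounds i32, ptr %a, i64 %idx
// the second is RAUW'd to the first and erased, since isIdenticalTo treats
// them as equal and CSEDenseMapInfo hashes them to the same bucket.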
2580
2581 /// This function attempts to return a value that represents the vectorization
2582 /// factor at runtime. For fixed-width VFs we know this precisely at compile
2583 /// time, but for scalable VFs we calculate it based on an estimate of the
2584 /// vscale value.
2585 static unsigned getEstimatedRuntimeVF(ElementCount VF,
2586 std::optional<unsigned> VScale) {
2587 unsigned EstimatedVF = VF.getKnownMinValue();
2588 if (VF.isScalable())
2589 if (VScale)
2590 EstimatedVF *= *VScale;
2591 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2592 return EstimatedVF;
2593 }
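
// For example: for VF = vscale x 4 with a tuning value of vscale = 2, the
// estimate is 4 * 2 = 8 lanes; for a fixed VF = 8, or when no vscale value is
// known, the known minimum (8 and 4 respectively) is returned as-is.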
2594
2595 InstructionCost
2596 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2597 ElementCount VF) const {
2598 // We only need to calculate a cost if the VF is scalar; for actual vectors
2599 // we should already have a pre-calculated cost at each VF.
2600 if (!VF.isScalar())
2601 return getCallWideningDecision(CI, VF).Cost;
2602
2603 Type *RetTy = CI->getType();
2604 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2605 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2606 return *RedCost;
2607
2608 SmallVector<Type *, 4> Tys;
2609 for (auto &ArgOp : CI->args())
2610 Tys.push_back(ArgOp->getType());
2611
2612 InstructionCost ScalarCallCost =
2613 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2614
2615 // If this is an intrinsic we may have a lower cost for it.
2616 if (getVectorIntrinsicIDForCall(CI, TLI)) {
2617 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2618 return std::min(ScalarCallCost, IntrinsicCost);
2619 }
2620 return ScalarCallCost;
2621 }
2622
2623 static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2624 if (VF.isScalar() || !canVectorizeTy(Ty))
2625 return Ty;
2626 return toVectorizedTy(Ty, VF);
2627 }
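
// For example: maybeVectorizeType(i32, VF=4) yields <4 x i32>, while a scalar
// VF, or a type that canVectorizeTy rejects, is returned unchanged.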
2628
2629 InstructionCost
2630 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2631 ElementCount VF) const {
2632 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2633 assert(ID && "Expected intrinsic call!");
2634 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2635 FastMathFlags FMF;
2636 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2637 FMF = FPMO->getFastMathFlags();
2638
2639 SmallVector<const Value *> Arguments(CI->args());
2640 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2641 SmallVector<Type *> ParamTys;
2642 std::transform(FTy->param_begin(), FTy->param_end(),
2643 std::back_inserter(ParamTys),
2644 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2645
2646 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2647 dyn_cast<IntrinsicInst>(CI),
2648 InstructionCost::getInvalid(), TLI);
2649 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2650 }
2651
2652 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2653 // Fix widened non-induction PHIs by setting up the PHI operands.
2654 fixNonInductionPHIs(State);
2655
2656 // After vectorization, the exit blocks of the original loop will have
2657 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2658 // looked through single-entry phis.
2659 SmallVector<BasicBlock *> ExitBlocks;
2660 OrigLoop->getExitBlocks(ExitBlocks);
2661 for (BasicBlock *Exit : ExitBlocks)
2662 for (PHINode &PN : Exit->phis())
2663 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
2664
2665   // Forget the original loop.
2666 PSE.getSE()->forgetLoop(OrigLoop);
2667 PSE.getSE()->forgetBlockAndLoopDispositions();
2668
2669 // Don't apply optimizations below when no (vector) loop remains, as they all
2670 // require one at the moment.
2671 VPBasicBlock *HeaderVPBB =
2672 vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2673 if (!HeaderVPBB)
2674 return;
2675
2676 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2677
2678 // Remove redundant induction instructions.
2679 cse(HeaderBB);
2680
2681 // Set/update profile weights for the vector and remainder loops as original
2682 // loop iterations are now distributed among them. Note that original loop
2683 // becomes the scalar remainder loop after vectorization.
2684 //
2685   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
2686   // end up with a slightly less accurate result, but that should be OK since
2687 // profile is not inherently precise anyway. Note also possible bypass of
2688 // vector code caused by legality checks is ignored, assigning all the weight
2689 // to the vector loop, optimistically.
2690 //
2691 // For scalable vectorization we can't know at compile time how many
2692 // iterations of the loop are handled in one vector iteration, so instead
2693 // use the value of vscale used for tuning.
2694 Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2695 unsigned EstimatedVFxUF =
2696 getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
2697 setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
2698 }
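
// As a rough example of the profile update: if the original loop's profile
// implies 1000 iterations and EstimatedVFxUF is 8, the vector loop is
// assigned ~125 iterations and the scalar remainder the rest, analogous to
// unrolling by a factor of 8.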
2699
2700 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
2701 auto Iter = vp_depth_first_shallow(Plan.getEntry());
2702 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
2703 for (VPRecipeBase &P : VPBB->phis()) {
2704 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
2705 if (!VPPhi)
2706 continue;
2707 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
2708 // Make sure the builder has a valid insert point.
2709 Builder.SetInsertPoint(NewPhi);
2710 for (unsigned Idx = 0; Idx < VPPhi->getNumIncoming(); ++Idx) {
2711 VPValue *Inc = VPPhi->getIncomingValue(Idx);
2712 const VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
2713 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
2714 }
2715 }
2716 }
2717 }
2718
2719 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2720 // We should not collect Scalars more than once per VF. Right now, this
2721 // function is called from collectUniformsAndScalars(), which already does
2722 // this check. Collecting Scalars for VF=1 does not make any sense.
2723 assert(VF.isVector() && !Scalars.contains(VF) &&
2724 "This function should not be visited twice for the same VF");
2725
2726 // This avoids any chances of creating a REPLICATE recipe during planning
2727 // since that would result in generation of scalarized code during execution,
2728 // which is not supported for scalable vectors.
2729 if (VF.isScalable()) {
2730 Scalars[VF].insert_range(Uniforms[VF]);
2731 return;
2732 }
2733
2734 SmallSetVector<Instruction *, 8> Worklist;
2735
2736 // These sets are used to seed the analysis with pointers used by memory
2737 // accesses that will remain scalar.
2738 SmallSetVector<Instruction *, 8> ScalarPtrs;
2739 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2740 auto *Latch = TheLoop->getLoopLatch();
2741
2742 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2743 // The pointer operands of loads and stores will be scalar as long as the
2744 // memory access is not a gather or scatter operation. The value operand of a
2745 // store will remain scalar if the store is scalarized.
2746 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2747 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
2748 assert(WideningDecision != CM_Unknown &&
2749 "Widening decision should be ready at this moment");
2750 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
2751 if (Ptr == Store->getValueOperand())
2752 return WideningDecision == CM_Scalarize;
2753 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2754            "Ptr is neither a value nor a pointer operand");
2755 return WideningDecision != CM_GatherScatter;
2756 };
2757
2758 // A helper that returns true if the given value is a getelementptr
2759 // instruction contained in the loop.
2760 auto IsLoopVaryingGEP = [&](Value *V) {
2761 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
2762 };
2763
2764 // A helper that evaluates a memory access's use of a pointer. If the use will
2765 // be a scalar use and the pointer is only used by memory accesses, we place
2766 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2767 // PossibleNonScalarPtrs.
2768 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2769     // We only care about loop-varying getelementptr instructions, as
2770     // checked by IsLoopVaryingGEP.
2771 if (!IsLoopVaryingGEP(Ptr))
2772 return;
2773
2774 // If the pointer has already been identified as scalar (e.g., if it was
2775 // also identified as uniform), there's nothing to do.
2776 auto *I = cast<Instruction>(Ptr);
2777 if (Worklist.count(I))
2778 return;
2779
2780 // If the use of the pointer will be a scalar use, and all users of the
2781 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2782 // place the pointer in PossibleNonScalarPtrs.
2783 if (IsScalarUse(MemAccess, Ptr) &&
2784 all_of(I->users(), IsaPred<LoadInst, StoreInst>))
2785 ScalarPtrs.insert(I);
2786 else
2787 PossibleNonScalarPtrs.insert(I);
2788 };
2789
2790   // We seed the scalars analysis with two classes of instructions: (1)
2791 // instructions marked uniform-after-vectorization and (2) bitcast,
2792 // getelementptr and (pointer) phi instructions used by memory accesses
2793 // requiring a scalar use.
2794 //
2795 // (1) Add to the worklist all instructions that have been identified as
2796 // uniform-after-vectorization.
2797 Worklist.insert_range(Uniforms[VF]);
2798
2799 // (2) Add to the worklist all bitcast and getelementptr instructions used by
2800 // memory accesses requiring a scalar use. The pointer operands of loads and
2801 // stores will be scalar unless the operation is a gather or scatter.
2802 // The value operand of a store will remain scalar if the store is scalarized.
2803 for (auto *BB : TheLoop->blocks())
2804 for (auto &I : *BB) {
2805 if (auto *Load = dyn_cast<LoadInst>(&I)) {
2806 EvaluatePtrUse(Load, Load->getPointerOperand());
2807 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
2808 EvaluatePtrUse(Store, Store->getPointerOperand());
2809 EvaluatePtrUse(Store, Store->getValueOperand());
2810 }
2811 }
2812 for (auto *I : ScalarPtrs)
2813 if (!PossibleNonScalarPtrs.count(I)) {
2814 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2815 Worklist.insert(I);
2816 }
2817
2818 // Insert the forced scalars.
2819 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2820 // induction variable when the PHI user is scalarized.
2821 auto ForcedScalar = ForcedScalars.find(VF);
2822 if (ForcedScalar != ForcedScalars.end())
2823 for (auto *I : ForcedScalar->second) {
2824 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2825 Worklist.insert(I);
2826 }
2827
2828 // Expand the worklist by looking through any bitcasts and getelementptr
2829 // instructions we've already identified as scalar. This is similar to the
2830 // expansion step in collectLoopUniforms(); however, here we're only
2831 // expanding to include additional bitcasts and getelementptr instructions.
2832 unsigned Idx = 0;
2833 while (Idx != Worklist.size()) {
2834 Instruction *Dst = Worklist[Idx++];
2835 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
2836 continue;
2837 auto *Src = cast<Instruction>(Dst->getOperand(0));
2838 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
2839 auto *J = cast<Instruction>(U);
2840 return !TheLoop->contains(J) || Worklist.count(J) ||
2841 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
2842 IsScalarUse(J, Src));
2843 })) {
2844 Worklist.insert(Src);
2845 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2846 }
2847 }
2848
2849 // An induction variable will remain scalar if all users of the induction
2850 // variable and induction variable update remain scalar.
2851 for (const auto &Induction : Legal->getInductionVars()) {
2852 auto *Ind = Induction.first;
2853 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2854
2855 // If tail-folding is applied, the primary induction variable will be used
2856 // to feed a vector compare.
2857 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2858 continue;
2859
2860 // Returns true if \p Indvar is a pointer induction that is used directly by
2861 // load/store instruction \p I.
2862 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2863 Instruction *I) {
2864 return Induction.second.getKind() ==
2865 InductionDescriptor::IK_PtrInduction &&
2866 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
2867 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
2868 };
2869
2870 // Determine if all users of the induction variable are scalar after
2871 // vectorization.
2872 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
2873 auto *I = cast<Instruction>(U);
2874 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2875 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2876 });
2877 if (!ScalarInd)
2878 continue;
2879
2880 // If the induction variable update is a fixed-order recurrence, neither the
2881     // induction variable nor its update should be marked scalar after
2882 // vectorization.
2883 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
2884 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
2885 continue;
2886
2887 // Determine if all users of the induction variable update instruction are
2888 // scalar after vectorization.
2889 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2890 auto *I = cast<Instruction>(U);
2891 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
2892 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2893 });
2894 if (!ScalarIndUpdate)
2895 continue;
2896
2897 // The induction variable and its update instruction will remain scalar.
2898 Worklist.insert(Ind);
2899 Worklist.insert(IndUpdate);
2900 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2901 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2902 << "\n");
2903 }
2904
2905 Scalars[VF].insert_range(Worklist);
2906 }
2907
2908 bool LoopVectorizationCostModel::isScalarWithPredication(
2909 Instruction *I, ElementCount VF) const {
2910 if (!isPredicatedInst(I))
2911 return false;
2912
2913 // Do we have a non-scalar lowering for this predicated
2914 // instruction? No - it is scalar with predication.
2915   switch (I->getOpcode()) {
2916 default:
2917 return true;
2918 case Instruction::Call:
2919 if (VF.isScalar())
2920 return true;
2921 return getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize;
2922 case Instruction::Load:
2923 case Instruction::Store: {
2924 auto *Ptr = getLoadStorePointerOperand(I);
2925 auto *Ty = getLoadStoreType(I);
2926 unsigned AS = getLoadStoreAddressSpace(I);
2927 Type *VTy = Ty;
2928 if (VF.isVector())
2929 VTy = VectorType::get(Ty, VF);
2930 const Align Alignment = getLoadStoreAlignment(I);
2931 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment, AS) ||
2932 TTI.isLegalMaskedGather(VTy, Alignment))
2933 : !(isLegalMaskedStore(Ty, Ptr, Alignment, AS) ||
2934 TTI.isLegalMaskedScatter(VTy, Alignment));
2935 }
2936 case Instruction::UDiv:
2937 case Instruction::SDiv:
2938 case Instruction::SRem:
2939 case Instruction::URem: {
2940 // We have the option to use the safe-divisor idiom to avoid predication.
2941 // The cost based decision here will always select safe-divisor for
2942 // scalable vectors as scalarization isn't legal.
2943 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
2944 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
2945 }
2946 }
2947 }
2948
2949 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2950 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
2951 // TODO: We can use the loop-preheader as context point here and get
2952 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2953 if (isSafeToSpeculativelyExecute(I) ||
2954 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
2955 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
2956 return false;
2957
2958 // If the instruction was executed conditionally in the original scalar loop,
2959 // predication is needed with a mask whose lanes are all possibly inactive.
2960 if (Legal->blockNeedsPredication(I->getParent()))
2961 return true;
2962
2963 // If we're not folding the tail by masking, predication is unnecessary.
2964 if (!foldTailByMasking())
2965 return false;
2966
2967 // All that remain are instructions with side-effects originally executed in
2968 // the loop unconditionally, but now execute under a tail-fold mask (only)
2969 // having at least one active lane (the first). If the side-effects of the
2970 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2971 // - it will cause the same side-effects as when masked.
2972   switch (I->getOpcode()) {
2973 default:
2974 llvm_unreachable(
2975 "instruction should have been considered by earlier checks");
2976 case Instruction::Call:
2977 // Side-effects of a Call are assumed to be non-invariant, needing a
2978 // (fold-tail) mask.
2979 assert(Legal->isMaskRequired(I) &&
2980 "should have returned earlier for calls not needing a mask");
2981 return true;
2982 case Instruction::Load:
2983 // If the address is loop invariant no predication is needed.
2984 return !Legal->isInvariant(getLoadStorePointerOperand(I));
2985 case Instruction::Store: {
2986 // For stores, we need to prove both speculation safety (which follows from
2987 // the same argument as loads), but also must prove the value being stored
2988     // is correct. The easiest form of the latter is to require that all values
2989 // stored are the same.
2990 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
2991 Legal->isInvariant(cast<StoreInst>(I)->getValueOperand()));
2992 }
2993 case Instruction::UDiv:
2994 case Instruction::SDiv:
2995 case Instruction::SRem:
2996 case Instruction::URem:
2997 // If the divisor is loop-invariant no predication is needed.
2998 return !Legal->isInvariant(I->getOperand(1));
2999 }
3000 }
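
// For example (a sketch): under tail folding, 'udiv %x, %n' with a
// loop-invariant divisor %n needs no predication, since executing it under a
// partially inactive mask causes the same side-effects as the original scalar
// loop; 'udiv %x, %y' with a loop-varying %y is predicated, as a masked-off
// lane might divide by zero.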
3001
3002 std::pair<InstructionCost, InstructionCost>
3003 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3004 ElementCount VF) const {
3005 assert(I->getOpcode() == Instruction::UDiv ||
3006 I->getOpcode() == Instruction::SDiv ||
3007 I->getOpcode() == Instruction::SRem ||
3008 I->getOpcode() == Instruction::URem);
3009 assert(!isSafeToSpeculativelyExecute(I));
3010
3011 // Scalarization isn't legal for scalable vector types
3012 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3013 if (!VF.isScalable()) {
3014 // Get the scalarization cost and scale this amount by the probability of
3015 // executing the predicated block. If the instruction is not predicated,
3016 // we fall through to the next case.
3017 ScalarizationCost = 0;
3018
3019 // These instructions have a non-void type, so account for the phi nodes
3020 // that we will create. This cost is likely to be zero. The phi node
3021 // cost, if any, should be scaled by the block probability because it
3022 // models a copy at the end of each predicated block.
3023 ScalarizationCost +=
3024 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
3025
3026 // The cost of the non-predicated instruction.
3027 ScalarizationCost +=
3028 VF.getFixedValue() *
3029 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3030
3031 // The cost of insertelement and extractelement instructions needed for
3032 // scalarization.
3033 ScalarizationCost += getScalarizationOverhead(I, VF);
3034
3035 // Scale the cost by the probability of executing the predicated blocks.
3036 // This assumes the predicated block for each vector lane is equally
3037 // likely.
3038 ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
3039 }
3040 InstructionCost SafeDivisorCost = 0;
3041
3042 auto *VecTy = toVectorTy(I->getType(), VF);
3043
3044 // The cost of the select guard to ensure all lanes are well defined
3045 // after we speculate above any internal control flow.
3046 SafeDivisorCost +=
3047 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3048 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3049 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3050
3051 // Certain instructions can be cheaper to vectorize if they have a constant
3052 // second vector operand. One example of this are shifts on x86.
3053 Value *Op2 = I->getOperand(1);
3054 auto Op2Info = TTI.getOperandInfo(Op2);
3055 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3056 Legal->isInvariant(Op2))
3057 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3058
3059 SmallVector<const Value *, 4> Operands(I->operand_values());
3060 SafeDivisorCost += TTI.getArithmeticInstrCost(
3061 I->getOpcode(), VecTy, CostKind,
3062 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3063 Op2Info, Operands, I);
3064 return {ScalarizationCost, SafeDivisorCost};
3065 }
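
// The safe-divisor idiom costed above replaces a predicated divide by
// substituting a benign divisor on inactive lanes, e.g. (a sketch):
//   %safe.d = select <4 x i1> %mask, <4 x i32> %d,
//                    <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %quot   = udiv <4 x i32> %x, %safe.d
// so SafeDivisorCost models one vector select plus the vector divide itself.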
3066
3067 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3068 Instruction *I, ElementCount VF) const {
3069 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3070 assert(getWideningDecision(I, VF) == CM_Unknown &&
3071 "Decision should not be set yet.");
3072 auto *Group = getInterleavedAccessGroup(I);
3073 assert(Group && "Must have a group.");
3074 unsigned InterleaveFactor = Group->getFactor();
3075
3076 // If the instruction's allocated size doesn't equal its type size, it
3077 // requires padding and will be scalarized.
3078 auto &DL = I->getDataLayout();
3079 auto *ScalarTy = getLoadStoreType(I);
3080 if (hasIrregularType(ScalarTy, DL))
3081 return false;
3082
3083 // For scalable vectors, the interleave factors must be <= 8 since we require
3084 // the (de)interleaveN intrinsics instead of shufflevectors.
3085 if (VF.isScalable() && InterleaveFactor > 8)
3086 return false;
3087
3088 // If the group involves a non-integral pointer, we may not be able to
3089 // losslessly cast all values to a common type.
3090 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3091 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3092 Instruction *Member = Group->getMember(Idx);
3093 if (!Member)
3094 continue;
3095 auto *MemberTy = getLoadStoreType(Member);
3096 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3097 // Don't coerce non-integral pointers to integers or vice versa.
3098 if (MemberNI != ScalarNI)
3099 // TODO: Consider adding special nullptr value case here
3100 return false;
3101 if (MemberNI && ScalarNI &&
3102 ScalarTy->getPointerAddressSpace() !=
3103 MemberTy->getPointerAddressSpace())
3104 return false;
3105 }
3106
3107 // Check if masking is required.
3108 // A Group may need masking for one of two reasons: it resides in a block that
3109 // needs predication, or it was decided to use masking to deal with gaps
3110 // (either a gap at the end of a load-access that may result in a speculative
3111 // load, or any gaps in a store-access).
3112 bool PredicatedAccessRequiresMasking =
3113 blockNeedsPredicationForAnyReason(I->getParent()) &&
3114 Legal->isMaskRequired(I);
3115 bool LoadAccessWithGapsRequiresEpilogMasking =
3116 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3117 !isScalarEpilogueAllowed();
3118 bool StoreAccessWithGapsRequiresMasking =
3119 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3120 if (!PredicatedAccessRequiresMasking &&
3121 !LoadAccessWithGapsRequiresEpilogMasking &&
3122 !StoreAccessWithGapsRequiresMasking)
3123 return true;
3124
3125 // If masked interleaving is required, we expect that the user/target had
3126 // enabled it, because otherwise it either wouldn't have been created or
3127 // it should have been invalidated by the CostModel.
3128 assert(useMaskedInterleavedAccesses(TTI) &&
3129 "Masked interleave-groups for predicated accesses are not enabled.");
3130
3131 if (Group->isReverse())
3132 return false;
3133
3134 auto *Ty = getLoadStoreType(I);
3135 const Align Alignment = getLoadStoreAlignment(I);
3136 unsigned AS = getLoadStoreAddressSpace(I);
3137 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
3138 : TTI.isLegalMaskedStore(Ty, Alignment, AS);
3139 }
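
// For example (a sketch): a factor-2 load group reading both fields of
//   struct S { int x; int y; };
// can be widened into one wide load followed by deinterleave shuffles. A
// factor-2 store group writing only the x field has a gap, so it can only be
// widened when masked stores are legal for the target.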
3140
3141 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3142 Instruction *I, ElementCount VF) {
3143 // Get and ensure we have a valid memory instruction.
3144 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3145
3146 auto *Ptr = getLoadStorePointerOperand(I);
3147 auto *ScalarTy = getLoadStoreType(I);
3148
3149 // In order to be widened, the pointer should be consecutive, first of all.
3150 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3151 return false;
3152
3153 // If the instruction is a store located in a predicated block, it will be
3154 // scalarized.
3155 if (isScalarWithPredication(I, VF))
3156 return false;
3157
3158   // If the instruction's allocated size doesn't equal its type size, it
3159 // requires padding and will be scalarized.
3160 auto &DL = I->getDataLayout();
3161 if (hasIrregularType(ScalarTy, DL))
3162 return false;
3163
3164 return true;
3165 }
3166
3167 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3168 // We should not collect Uniforms more than once per VF. Right now,
3169 // this function is called from collectUniformsAndScalars(), which
3170 // already does this check. Collecting Uniforms for VF=1 does not make any
3171 // sense.
3172
3173 assert(VF.isVector() && !Uniforms.contains(VF) &&
3174 "This function should not be visited twice for the same VF");
3175
3176   // Create an entry for this VF up front; even if no uniform values are
3177   // found, Uniforms.count(VF) will return 1, so we won't analyze it again.
3178 Uniforms[VF].clear();
3179
3180 // Now we know that the loop is vectorizable!
3181 // Collect instructions inside the loop that will remain uniform after
3182 // vectorization.
3183
3184   // Global values, params and instructions outside of the current loop are
3185   // out of scope.
3186 auto IsOutOfScope = [&](Value *V) -> bool {
3187 Instruction *I = dyn_cast<Instruction>(V);
3188 return (!I || !TheLoop->contains(I));
3189 };
3190
3191 // Worklist containing uniform instructions demanding lane 0.
3192 SetVector<Instruction *> Worklist;
3193
3194 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3195 // that require predication must not be considered uniform after
3196 // vectorization, because that would create an erroneous replicating region
3197 // where only a single instance out of VF should be formed.
3198 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3199 if (IsOutOfScope(I)) {
3200 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3201 << *I << "\n");
3202 return;
3203 }
3204 if (isPredicatedInst(I)) {
3205 LLVM_DEBUG(
3206 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3207 << "\n");
3208 return;
3209 }
3210 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3211 Worklist.insert(I);
3212 };
3213
3214 // Start with the conditional branches exiting the loop. If the branch
3215 // condition is an instruction contained in the loop that is only used by the
3216 // branch, it is uniform. Note conditions from uncountable early exits are not
3217 // uniform.
3218 SmallVector<BasicBlock *> Exiting;
3219 TheLoop->getExitingBlocks(Exiting);
3220 for (BasicBlock *E : Exiting) {
3221 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3222 continue;
3223 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3224 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3225 AddToWorklistIfAllowed(Cmp);
3226 }
3227
3228 auto PrevVF = VF.divideCoefficientBy(2);
3229 // Return true if all lanes perform the same memory operation, and we can
3230 // thus choose to execute only one.
3231 auto IsUniformMemOpUse = [&](Instruction *I) {
3232 // If the value was already known to not be uniform for the previous
3233 // (smaller VF), it cannot be uniform for the larger VF.
3234 if (PrevVF.isVector()) {
3235 auto Iter = Uniforms.find(PrevVF);
3236 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3237 return false;
3238 }
3239 if (!Legal->isUniformMemOp(*I, VF))
3240 return false;
3241 if (isa<LoadInst>(I))
3242 // Loading the same address always produces the same result - at least
3243 // assuming aliasing and ordering which have already been checked.
3244 return true;
3245 // Storing the same value on every iteration.
3246 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3247 };
3248
3249 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3250 InstWidening WideningDecision = getWideningDecision(I, VF);
3251 assert(WideningDecision != CM_Unknown &&
3252 "Widening decision should be ready at this moment");
3253
3254 if (IsUniformMemOpUse(I))
3255 return true;
3256
3257 return (WideningDecision == CM_Widen ||
3258 WideningDecision == CM_Widen_Reverse ||
3259 WideningDecision == CM_Interleave);
3260 };
3261
3262 // Returns true if Ptr is the pointer operand of a memory access instruction
3263 // I, I is known to not require scalarization, and the pointer is not also
3264 // stored.
3265 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3266 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3267 return false;
3268 return getLoadStorePointerOperand(I) == Ptr &&
3269 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3270 };
3271
3272 // Holds a list of values which are known to have at least one uniform use.
3273 // Note that there may be other uses which aren't uniform. A "uniform use"
3274 // here is something which only demands lane 0 of the unrolled iterations;
3275 // it does not imply that all lanes produce the same value (e.g. this is not
3276 // the usual meaning of uniform)
3277 SetVector<Value *> HasUniformUse;
3278
3279 // Scan the loop for instructions which are either a) known to have only
3280 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3281 for (auto *BB : TheLoop->blocks())
3282 for (auto &I : *BB) {
3283 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3284 switch (II->getIntrinsicID()) {
3285 case Intrinsic::sideeffect:
3286 case Intrinsic::experimental_noalias_scope_decl:
3287 case Intrinsic::assume:
3288 case Intrinsic::lifetime_start:
3289 case Intrinsic::lifetime_end:
3290 if (TheLoop->hasLoopInvariantOperands(&I))
3291 AddToWorklistIfAllowed(&I);
3292 break;
3293 default:
3294 break;
3295 }
3296 }
3297
3298 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3299 if (IsOutOfScope(EVI->getAggregateOperand())) {
3300 AddToWorklistIfAllowed(EVI);
3301 continue;
3302 }
3303 // Only ExtractValue instructions where the aggregate value comes from a
3304 // call are allowed to be non-uniform.
3305 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3306 "Expected aggregate value to be call return value");
3307 }
3308
3309 // If there's no pointer operand, there's nothing to do.
3310 auto *Ptr = getLoadStorePointerOperand(&I);
3311 if (!Ptr)
3312 continue;
3313
3314 if (IsUniformMemOpUse(&I))
3315 AddToWorklistIfAllowed(&I);
3316
3317 if (IsVectorizedMemAccessUse(&I, Ptr))
3318 HasUniformUse.insert(Ptr);
3319 }
3320
3321 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3322 // demanding) users. Since loops are assumed to be in LCSSA form, this
3323 // disallows uses outside the loop as well.
3324 for (auto *V : HasUniformUse) {
3325 if (IsOutOfScope(V))
3326 continue;
3327 auto *I = cast<Instruction>(V);
3328 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3329 auto *UI = cast<Instruction>(U);
3330 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3331 });
3332 if (UsersAreMemAccesses)
3333 AddToWorklistIfAllowed(I);
3334 }
3335
3336   // Expand Worklist in topological order: whenever a new instruction
3337   // is added, its users should already be inside Worklist. This ensures
3338   // that a uniform instruction will only be used by uniform instructions.
3339 unsigned Idx = 0;
3340 while (Idx != Worklist.size()) {
3341 Instruction *I = Worklist[Idx++];
3342
3343 for (auto *OV : I->operand_values()) {
3344       // Out-of-scope operands cannot be uniform instructions.
3345 if (IsOutOfScope(OV))
3346 continue;
3347       // Fixed-order recurrence phis should typically be considered
3348       // non-uniform.
3349 auto *OP = dyn_cast<PHINode>(OV);
3350 if (OP && Legal->isFixedOrderRecurrence(OP))
3351 continue;
3352 // If all the users of the operand are uniform, then add the
3353 // operand into the uniform worklist.
3354 auto *OI = cast<Instruction>(OV);
3355 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3356 auto *J = cast<Instruction>(U);
3357 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3358 }))
3359 AddToWorklistIfAllowed(OI);
3360 }
3361 }
3362
3363 // For an instruction to be added into Worklist above, all its users inside
3364 // the loop should also be in Worklist. However, this condition cannot be
3365 // true for phi nodes that form a cyclic dependence. We must process phi
3366 // nodes separately. An induction variable will remain uniform if all users
3367 // of the induction variable and induction variable update remain uniform.
3368 // The code below handles both pointer and non-pointer induction variables.
3369 BasicBlock *Latch = TheLoop->getLoopLatch();
3370 for (const auto &Induction : Legal->getInductionVars()) {
3371 auto *Ind = Induction.first;
3372 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3373
3374 // Determine if all users of the induction variable are uniform after
3375 // vectorization.
3376 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3377 auto *I = cast<Instruction>(U);
3378 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3379 IsVectorizedMemAccessUse(I, Ind);
3380 });
3381 if (!UniformInd)
3382 continue;
3383
3384 // Determine if all users of the induction variable update instruction are
3385 // uniform after vectorization.
3386 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3387 auto *I = cast<Instruction>(U);
3388 return I == Ind || Worklist.count(I) ||
3389 IsVectorizedMemAccessUse(I, IndUpdate);
3390 });
3391 if (!UniformIndUpdate)
3392 continue;
3393
3394 // The induction variable and its update instruction will remain uniform.
3395 AddToWorklistIfAllowed(Ind);
3396 AddToWorklistIfAllowed(IndUpdate);
3397 }
3398
3399 Uniforms[VF].insert_range(Worklist);
3400 }
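
// For example (a sketch): in
//   %cmp = icmp eq i64 %iv.next, %n
//   br i1 %cmp, label %exit, label %header
// the latch compare only feeds the branch, so after vectorization only lane 0
// of it is demanded and it is collected as uniform, together with the
// induction update feeding it when all of its other users are uniform too.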
3401
3402 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3403 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3404
3405 if (Legal->getRuntimePointerChecking()->Need) {
3406 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3407 "runtime pointer checks needed. Enable vectorization of this "
3408 "loop with '#pragma clang loop vectorize(enable)' when "
3409 "compiling with -Os/-Oz",
3410 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3411 return true;
3412 }
3413
3414 if (!PSE.getPredicate().isAlwaysTrue()) {
3415 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3416 "runtime SCEV checks needed. Enable vectorization of this "
3417 "loop with '#pragma clang loop vectorize(enable)' when "
3418 "compiling with -Os/-Oz",
3419 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3420 return true;
3421 }
3422
3423 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3424 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3425 reportVectorizationFailure("Runtime stride check for small trip count",
3426 "runtime stride == 1 checks needed. Enable vectorization of "
3427 "this loop without such check by compiling with -Os/-Oz",
3428 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3429 return true;
3430 }
3431
3432 return false;
3433 }
3434
3435 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3436 if (IsScalableVectorizationAllowed)
3437 return *IsScalableVectorizationAllowed;
3438
3439 IsScalableVectorizationAllowed = false;
3440 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3441 return false;
3442
3443 if (Hints->isScalableVectorizationDisabled()) {
3444 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3445 "ScalableVectorizationDisabled", ORE, TheLoop);
3446 return false;
3447 }
3448
3449 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3450
3451 auto MaxScalableVF = ElementCount::getScalable(
3452 std::numeric_limits<ElementCount::ScalarTy>::max());
3453
3454 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3455 // FIXME: While for scalable vectors this is currently sufficient, this should
3456 // be replaced by a more detailed mechanism that filters out specific VFs,
3457 // instead of invalidating vectorization for a whole set of VFs based on the
3458 // MaxVF.
3459
3460 // Disable scalable vectorization if the loop contains unsupported reductions.
3461 if (!canVectorizeReductions(MaxScalableVF)) {
3462 reportVectorizationInfo(
3463 "Scalable vectorization not supported for the reduction "
3464 "operations found in this loop.",
3465 "ScalableVFUnfeasible", ORE, TheLoop);
3466 return false;
3467 }
3468
3469 // Disable scalable vectorization if the loop contains any instructions
3470 // with element types not supported for scalable vectors.
3471 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3472 return !Ty->isVoidTy() &&
3473 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3474 })) {
3475 reportVectorizationInfo("Scalable vectorization is not supported "
3476 "for all element types found in this loop.",
3477 "ScalableVFUnfeasible", ORE, TheLoop);
3478 return false;
3479 }
3480
3481 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3482 reportVectorizationInfo("The target does not provide maximum vscale value "
3483 "for safe distance analysis.",
3484 "ScalableVFUnfeasible", ORE, TheLoop);
3485 return false;
3486 }
3487
3488 IsScalableVectorizationAllowed = true;
3489 return true;
3490 }
3491
3492 ElementCount
3493 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3494 if (!isScalableVectorizationAllowed())
3495 return ElementCount::getScalable(0);
3496
3497 auto MaxScalableVF = ElementCount::getScalable(
3498 std::numeric_limits<ElementCount::ScalarTy>::max());
3499 if (Legal->isSafeForAnyVectorWidth())
3500 return MaxScalableVF;
3501
3502 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3503 // Limit MaxScalableVF by the maximum safe dependence distance.
3504 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3505
3506 if (!MaxScalableVF)
3507 reportVectorizationInfo(
3508 "Max legal vector width too small, scalable vectorization "
3509 "unfeasible.",
3510 "ScalableVFUnfeasible", ORE, TheLoop);
3511
3512 return MaxScalableVF;
3513 }
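
// For example: with MaxSafeElements = 32 and a target maximum vscale of 16,
// the clamp yields ElementCount::getScalable(32 / 16) = vscale x 2, i.e. at
// most 32 lanes even when vscale takes its largest possible value.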
3514
3515 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3516 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3517 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3518 unsigned SmallestType, WidestType;
3519 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3520
3521 // Get the maximum safe dependence distance in bits computed by LAA.
3522 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3523 // the memory accesses that is most restrictive (involved in the smallest
3524 // dependence distance).
3525 unsigned MaxSafeElementsPowerOf2 =
3526 bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3527 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
3528 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
3529 MaxSafeElementsPowerOf2 =
3530 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
3531 }
3532 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
3533 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
3534
3535 if (!Legal->isSafeForAnyVectorWidth())
3536 this->MaxSafeElements = MaxSafeElementsPowerOf2;
3537
3538 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3539 << ".\n");
3540 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3541 << ".\n");
3542
3543 // First analyze the UserVF, fall back if the UserVF should be ignored.
3544 if (UserVF) {
3545 auto MaxSafeUserVF =
3546 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3547
3548 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3549 // If `VF=vscale x N` is safe, then so is `VF=N`
3550 if (UserVF.isScalable())
3551 return FixedScalableVFPair(
3552 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3553
3554 return UserVF;
3555 }
3556
3557 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3558
3559 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3560 // is better to ignore the hint and let the compiler choose a suitable VF.
3561 if (!UserVF.isScalable()) {
3562 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3563 << " is unsafe, clamping to max safe VF="
3564 << MaxSafeFixedVF << ".\n");
3565 ORE->emit([&]() {
3566 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3567 TheLoop->getStartLoc(),
3568 TheLoop->getHeader())
3569 << "User-specified vectorization factor "
3570 << ore::NV("UserVectorizationFactor", UserVF)
3571 << " is unsafe, clamping to maximum safe vectorization factor "
3572 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3573 });
3574 return MaxSafeFixedVF;
3575 }
3576
3577 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3578 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3579 << " is ignored because scalable vectors are not "
3580 "available.\n");
3581 ORE->emit([&]() {
3582 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3583 TheLoop->getStartLoc(),
3584 TheLoop->getHeader())
3585 << "User-specified vectorization factor "
3586 << ore::NV("UserVectorizationFactor", UserVF)
3587 << " is ignored because the target does not support scalable "
3588 "vectors. The compiler will pick a more suitable value.";
3589 });
3590 } else {
3591 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3592 << " is unsafe. Ignoring scalable UserVF.\n");
3593 ORE->emit([&]() {
3594 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3595 TheLoop->getStartLoc(),
3596 TheLoop->getHeader())
3597 << "User-specified vectorization factor "
3598 << ore::NV("UserVectorizationFactor", UserVF)
3599 << " is unsafe. Ignoring the hint to let the compiler pick a "
3600 "more suitable value.";
3601 });
3602 }
3603 }
3604
3605 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3606 << " / " << WidestType << " bits.\n");
3607
3608 FixedScalableVFPair Result(ElementCount::getFixed(1),
3609 ElementCount::getScalable(0));
3610 if (auto MaxVF =
3611 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3612 MaxSafeFixedVF, FoldTailByMasking))
3613 Result.FixedVF = MaxVF;
3614
3615 if (auto MaxVF =
3616 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3617 MaxSafeScalableVF, FoldTailByMasking))
3618 if (MaxVF.isScalable()) {
3619 Result.ScalableVF = MaxVF;
3620 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3621 << "\n");
3622 }
3623
3624 return Result;
3625 }
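
// For example: a user hint of VF=16 on a loop whose maximum safe dependence
// distance only allows 8 lanes is clamped to the fixed VF=8 computed above,
// while an unsafe scalable hint such as 'vscale x 16' is ignored so the
// compiler can pick a suitable factor instead.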
3626
3627 FixedScalableVFPair
3628 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3629 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3630     // TODO: Inserting the runtime check may still be useful, since the
3631     // branch on it is likely to be dynamically uniform if the target can
3632     // skip it.
3632 reportVectorizationFailure(
3633 "Not inserting runtime ptr check for divergent target",
3634 "runtime pointer checks needed. Not enabled for divergent target",
3635 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3636 return FixedScalableVFPair::getNone();
3637 }
3638
3639 ScalarEvolution *SE = PSE.getSE();
3640 ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
3641 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3642 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3643 if (TC != ElementCount::getFixed(MaxTC))
3644 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3645 if (TC.isScalar()) {
3646 reportVectorizationFailure("Single iteration (non) loop",
3647 "loop trip count is one, irrelevant for vectorization",
3648 "SingleIterationLoop", ORE, TheLoop);
3649 return FixedScalableVFPair::getNone();
3650 }
3651
3652 // If BTC matches the widest induction type and is -1 then the trip count
3653 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3654 // to vectorize.
3655 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3656 if (!isa<SCEVCouldNotCompute>(BTC) &&
3657 BTC->getType()->getScalarSizeInBits() >=
3658 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3659 SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
3660 SE->getMinusOne(BTC->getType()))) {
3661 reportVectorizationFailure(
3662 "Trip count computation wrapped",
3663 "backedge-taken count is -1, loop trip count wrapped to 0",
3664 "TripCountWrapped", ORE, TheLoop);
3665 return FixedScalableVFPair::getNone();
3666 }
3667
3668 switch (ScalarEpilogueStatus) {
3669 case CM_ScalarEpilogueAllowed:
3670 return computeFeasibleMaxVF(MaxTC, UserVF, false);
3671 case CM_ScalarEpilogueNotAllowedUsePredicate:
3672 [[fallthrough]];
3673 case CM_ScalarEpilogueNotNeededUsePredicate:
3674 LLVM_DEBUG(
3675 dbgs() << "LV: vector predicate hint/switch found.\n"
3676 << "LV: Not allowing scalar epilogue, creating predicated "
3677 << "vector loop.\n");
3678 break;
3679 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3680 // fallthrough as a special case of OptForSize
3681 case CM_ScalarEpilogueNotAllowedOptSize:
3682 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3683 LLVM_DEBUG(
3684 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3685 else
3686 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3687 << "count.\n");
3688
3689 // Bail if runtime checks are required, which are not good when optimising
3690 // for size.
3691 if (runtimeChecksRequired())
3692 return FixedScalableVFPair::getNone();
3693
3694 break;
3695 }
3696
3697 // Now try the tail folding
3698
3699 // Invalidate interleave groups that require an epilogue if we can't mask
3700 // the interleave-group.
3701 if (!useMaskedInterleavedAccesses(TTI)) {
3702 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3703 "No decisions should have been taken at this point");
3704 // Note: There is no need to invalidate any cost modeling decisions here, as
3705 // none were taken so far.
3706 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3707 }
3708
3709 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
3710
3711 // Avoid tail folding if the trip count is known to be a multiple of any VF
3712 // we choose.
3713 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3714 MaxFactors.FixedVF.getFixedValue();
3715 if (MaxFactors.ScalableVF) {
3716 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3717 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3718 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3719 *MaxPowerOf2RuntimeVF,
3720 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3721 } else
3722 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3723 }
3724
3725 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3726 // Return false if the loop is neither a single-latch-exit loop nor an
3727 // early-exit loop as tail-folding is not supported in that case.
3728 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3729 !Legal->hasUncountableEarlyExit())
3730 return false;
3731 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3732 ScalarEvolution *SE = PSE.getSE();
3733 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3734 // with uncountable exits. For countable loops, the symbolic maximum must
3735 // remain identical to the known back-edge taken count.
3736 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3737 assert((Legal->hasUncountableEarlyExit() ||
3738 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3739 "Invalid loop count");
3740 const SCEV *ExitCount = SE->getAddExpr(
3741 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3742 const SCEV *Rem = SE->getURemExpr(
3743 SE->applyLoopGuards(ExitCount, TheLoop),
3744 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
3745 return Rem->isZero();
3746 };
3747
3748 if (MaxPowerOf2RuntimeVF > 0u) {
3749 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3750 "MaxFixedVF must be a power of 2");
3751 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3752 // Accept MaxFixedVF if we do not have a tail.
3753 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3754 return MaxFactors;
3755 }
3756 }
3757
3758 auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3759 if (ExpectedTC && ExpectedTC->isFixed() &&
3760 ExpectedTC->getFixedValue() <=
3761 TTI.getMinTripCountTailFoldingThreshold()) {
3762 if (MaxPowerOf2RuntimeVF > 0u) {
3763 // If we have a low-trip-count, and the fixed-width VF is known to divide
3764 // the trip count but the scalable factor does not, use the fixed-width
3765 // factor in preference to allow the generation of a non-predicated loop.
3766 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3767 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3768 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3769 "remain for any chosen VF.\n");
3770 MaxFactors.ScalableVF = ElementCount::getScalable(0);
3771 return MaxFactors;
3772 }
3773 }
3774
3775 reportVectorizationFailure(
3776         "The trip count is below the minimal threshold value.",
3777 "loop trip count is too low, avoiding vectorization", "LowTripCount",
3778 ORE, TheLoop);
3779 return FixedScalableVFPair::getNone();
3780 }
3781
3782 // If we don't know the precise trip count, or if the trip count that we
3783 // found modulo the vectorization factor is not zero, try to fold the tail
3784 // by masking.
3785 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3786 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3787 setTailFoldingStyles(ContainsScalableVF, UserIC);
3788 if (foldTailByMasking()) {
3789 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
3790 LLVM_DEBUG(
3791 dbgs()
3792 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3793 "try to generate VP Intrinsics with scalable vector "
3794 "factors only.\n");
3795       // A tail-folded loop using VP intrinsics restricts the VF to be
3796       // scalable for now.
3797 // TODO: extend it for fixed vectors, if required.
3798 assert(ContainsScalableVF && "Expected scalable vector factor.");
3799
3800 MaxFactors.FixedVF = ElementCount::getFixed(1);
3801 }
3802 return MaxFactors;
3803 }
3804
3805 // If there was a tail-folding hint/switch, but we can't fold the tail by
3806   // masking, fall back to vectorization with a scalar epilogue.
3807 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3808 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3809 "scalar epilogue instead.\n");
3810 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3811 return MaxFactors;
3812 }
3813
3814 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3815 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3816 return FixedScalableVFPair::getNone();
3817 }
3818
3819 if (TC.isZero()) {
3820 reportVectorizationFailure(
3821 "unable to calculate the loop count due to complex control flow",
3822 "UnknownLoopCountComplexCFG", ORE, TheLoop);
3823 return FixedScalableVFPair::getNone();
3824 }
3825
3826 reportVectorizationFailure(
3827 "Cannot optimize for size and vectorize at the same time.",
3828 "cannot optimize for size and vectorize at the same time. "
3829 "Enable vectorization of this loop with '#pragma clang loop "
3830 "vectorize(enable)' when compiling with -Os/-Oz",
3831 "NoTailLoopWithOptForSize", ORE, TheLoop);
3832 return FixedScalableVFPair::getNone();
3833 }
3834
3835 bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) {
3836 return useMaxBandwidth(VF.isScalable()
3837 ? TargetTransformInfo::RGK_ScalableVector
3838 : TargetTransformInfo::RGK_FixedWidthVector);
3839 }
3840
3841 bool LoopVectorizationCostModel::useMaxBandwidth(
3842 TargetTransformInfo::RegisterKind RegKind) {
3843 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3844 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
3845 (UseWiderVFIfCallVariantsPresent &&
3846 Legal->hasVectorCallVariants())));
3847 }
3848
3849 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3850 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3851 ElementCount MaxSafeVF, bool FoldTailByMasking) {
3852 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3853 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3854 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3855 : TargetTransformInfo::RGK_FixedWidthVector);
3856
3857 // Convenience function to return the minimum of two ElementCounts.
3858 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3859 assert((LHS.isScalable() == RHS.isScalable()) &&
3860 "Scalable flags must match");
3861 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3862 };
3863
3864 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3865   // Note that both WidestRegister and WidestType may not be powers of 2.
3866 auto MaxVectorElementCount = ElementCount::get(
3867 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
3868 ComputeScalableMaxVF);
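  // For instance (illustrative values): with a 128-bit widest register and a
  // widest element type of 32 bits, the initial MaxVectorElementCount is
  // bit_floor(128 / 32) = 4 lanes.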
3869 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3870 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3871 << (MaxVectorElementCount * WidestType) << " bits.\n");
3872
3873 if (!MaxVectorElementCount) {
3874 LLVM_DEBUG(dbgs() << "LV: The target has no "
3875 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3876 << " vector registers.\n");
3877 return ElementCount::getFixed(1);
3878 }
3879
3880 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
3881 if (MaxVectorElementCount.isScalable() &&
3882 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
3883 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
3884 auto Min = Attr.getVScaleRangeMin();
3885 WidestRegisterMinEC *= Min;
3886 }
3887
3888 // When a scalar epilogue is required, at least one iteration of the scalar
3889 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3890 // max VF that results in a dead vector loop.
3891 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
3892 MaxTripCount -= 1;
3893
3894 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
3895 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
3896     // If the upper bound loop trip count (TC) is known at compile time, there
3897     // is no point in choosing a VF greater than TC (as done in the loop
3898     // below). Select the maximum power of two which doesn't exceed TC. If
3899     // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
3900     // the TC is less than or equal to the known number of lanes.
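    // For instance (illustrative values, no tail folding): MaxTripCount = 5
    // clamps the VF to bit_floor(5) = 4.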
3901 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
3902 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3903 "exceeding the constant trip count: "
3904 << ClampedUpperTripCount << "\n");
3905 return ElementCount::get(
3906 ClampedUpperTripCount,
3907 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
3908 }
3909
3910 TargetTransformInfo::RegisterKind RegKind =
3911 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3912 : TargetTransformInfo::RGK_FixedWidthVector;
3913 ElementCount MaxVF = MaxVectorElementCount;
3914 if (useMaxBandwidth(RegKind)) {
3915 auto MaxVectorElementCountMaxBW = ElementCount::get(
3916 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
3917 ComputeScalableMaxVF);
3918 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3919
3920 if (ElementCount MinVF =
3921 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
3922 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
3923 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3924 << ") with target's minimum: " << MinVF << '\n');
3925 MaxVF = MinVF;
3926 }
3927 }
3928
3929 // Invalidate any widening decisions we might have made, in case the loop
3930 // requires prediction (decided later), but we have already made some
3931 // load/store widening decisions.
3932 invalidateCostModelingDecisions();
3933 }
3934 return MaxVF;
3935 }
3936
3937 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3938 const VectorizationFactor &B,
3939 const unsigned MaxTripCount,
3940 bool HasTail) const {
3941 InstructionCost CostA = A.Cost;
3942 InstructionCost CostB = B.Cost;
3943
3944 // Improve estimate for the vector width if it is scalable.
3945 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3946 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3947 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3948 if (A.Width.isScalable())
3949 EstimatedWidthA *= *VScale;
3950 if (B.Width.isScalable())
3951 EstimatedWidthB *= *VScale;
3952 }
3953
3954   // When optimizing for size, choose the factor with the smallest cost for
3955   // the whole loop. On a tie, pick the larger vector width, on the assumption
3956   // that throughput will be greater.
3957 if (CM.CostKind == TTI::TCK_CodeSize)
3958 return CostA < CostB ||
3959 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3960
3961 // Assume vscale may be larger than 1 (or the value being tuned for),
3962 // so that scalable vectorization is slightly favorable over fixed-width
3963 // vectorization.
3964 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
3965 A.Width.isScalable() && !B.Width.isScalable();
3966
3967 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3968 const InstructionCost &RHS) {
3969 return PreferScalable ? LHS <= RHS : LHS < RHS;
3970 };
3971
3972 // To avoid the need for FP division:
3973 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3974 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
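  // For instance (illustrative values): CostA = 8 at width 4 versus CostB = 6
  // at width 2 compares 8 * 2 = 16 against 6 * 4 = 24, so A has the better
  // per-lane cost.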
3975 if (!MaxTripCount)
3976 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3977
3978 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3979 InstructionCost VectorCost,
3980 InstructionCost ScalarCost) {
3981 // If the trip count is a known (possibly small) constant, the trip count
3982 // will be rounded up to an integer number of iterations under
3983 // FoldTailByMasking. The total cost in that case will be
3984 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3985 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3986 // some extra overheads, but for the purpose of comparing the costs of
3987 // different VFs we can use this to compare the total loop-body cost
3988 // expected after vectorization.
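    // For instance (illustrative values): with MaxTripCount = 10, VF = 4,
    // VectorCost = 8 and ScalarCost = 2, the estimate with a scalar tail is
    // 8 * 2 + 2 * 2 = 20, and with tail folding it is 8 * ceil(10 / 4) = 24.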
3989 if (HasTail)
3990 return VectorCost * (MaxTripCount / VF) +
3991 ScalarCost * (MaxTripCount % VF);
3992 return VectorCost * divideCeil(MaxTripCount, VF);
3993 };
3994
3995 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3996 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3997 return CmpFn(RTCostA, RTCostB);
3998 }
3999
4000 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
4001 const VectorizationFactor &B,
4002 bool HasTail) const {
4003 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4004 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
4005 HasTail);
4006 }
4007
4008 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4009 OptimizationRemarkEmitter *ORE) {
4010 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4011 SmallVector<RecipeVFPair> InvalidCosts;
4012 for (const auto &Plan : VPlans) {
4013 for (ElementCount VF : Plan->vectorFactors()) {
4014       // The VPlan-based cost model is designed for computing vector costs.
4015       // Querying the VPlan-based cost model with a scalar VF will cause
4016       // errors, because we expect the VF to be a vector for most of the
4017       // widen recipes.
4018 if (VF.isScalar())
4019 continue;
4020
4021 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4022 CM, CM.CostKind);
4023 precomputeCosts(*Plan, VF, CostCtx);
4024 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4025 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4026 for (auto &R : *VPBB) {
4027 if (!R.cost(VF, CostCtx).isValid())
4028 InvalidCosts.emplace_back(&R, VF);
4029 }
4030 }
4031 }
4032 }
4033 if (InvalidCosts.empty())
4034 return;
4035
4036 // Emit a report of VFs with invalid costs in the loop.
4037
4038 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4039 DenseMap<VPRecipeBase *, unsigned> Numbering;
4040 unsigned I = 0;
4041 for (auto &Pair : InvalidCosts)
4042 if (Numbering.try_emplace(Pair.first, I).second)
4043 ++I;
4044
4045   // Sort the list, first on recipe (by number), then on VF.
4046 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4047 unsigned NA = Numbering[A.first];
4048 unsigned NB = Numbering[B.first];
4049 if (NA != NB)
4050 return NA < NB;
4051 return ElementCount::isKnownLT(A.second, B.second);
4052 });
4053
4054 // For a list of ordered recipe-VF pairs:
4055 // [(load, VF1), (load, VF2), (store, VF1)]
4056 // group the recipes together to emit separate remarks for:
4057 // load (VF1, VF2)
4058 // store (VF1)
4059 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4060 auto Subset = ArrayRef<RecipeVFPair>();
4061 do {
4062 if (Subset.empty())
4063 Subset = Tail.take_front(1);
4064
4065 VPRecipeBase *R = Subset.front().first;
4066
4067 unsigned Opcode =
4068 TypeSwitch<const VPRecipeBase *, unsigned>(R)
4069 .Case<VPHeaderPHIRecipe>(
4070 [](const auto *R) { return Instruction::PHI; })
4071 .Case<VPWidenSelectRecipe>(
4072 [](const auto *R) { return Instruction::Select; })
4073 .Case<VPWidenStoreRecipe>(
4074 [](const auto *R) { return Instruction::Store; })
4075 .Case<VPWidenLoadRecipe>(
4076 [](const auto *R) { return Instruction::Load; })
4077 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4078 [](const auto *R) { return Instruction::Call; })
4079 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4080 VPWidenCastRecipe>(
4081 [](const auto *R) { return R->getOpcode(); })
4082 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4083 return R->getStoredValues().empty() ? Instruction::Load
4084 : Instruction::Store;
4085 });
4086
4087 // If the next recipe is different, or if there are no other pairs,
4088 // emit a remark for the collated subset. e.g.
4089     //   [(load, VF1), (load, VF2)]
4090 // to emit:
4091 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4092 if (Subset == Tail || Tail[Subset.size()].first != R) {
4093 std::string OutString;
4094 raw_string_ostream OS(OutString);
4095 assert(!Subset.empty() && "Unexpected empty range");
4096 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4097 for (const auto &Pair : Subset)
4098 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4099 OS << "):";
4100 if (Opcode == Instruction::Call) {
4101 StringRef Name = "";
4102 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4103 Name = Int->getIntrinsicName();
4104 } else {
4105 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4106 Function *CalledFn =
4107 WidenCall ? WidenCall->getCalledScalarFunction()
4108 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4109 ->getLiveInIRValue());
4110 Name = CalledFn->getName();
4111 }
4112 OS << " call to " << Name;
4113 } else
4114 OS << " " << Instruction::getOpcodeName(Opcode);
4115 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4116 R->getDebugLoc());
4117 Tail = Tail.drop_front(Subset.size());
4118 Subset = {};
4119 } else
4120 // Grow the subset by one element
4121 Subset = Tail.take_front(Subset.size() + 1);
4122 } while (!Tail.empty());
4123 }
4124
4125 /// Check if any recipe of \p Plan will generate a vector value, which will be
4126 /// assigned a vector register.
4127 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4128 const TargetTransformInfo &TTI) {
4129 assert(VF.isVector() && "Checking a scalar VF?");
4130 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4131 DenseSet<VPRecipeBase *> EphemeralRecipes;
4132 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4133 // Set of already visited types.
4134 DenseSet<Type *> Visited;
4135 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4136 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4137 for (VPRecipeBase &R : *VPBB) {
4138 if (EphemeralRecipes.contains(&R))
4139 continue;
4140 // Continue early if the recipe is considered to not produce a vector
4141 // result. Note that this includes VPInstruction where some opcodes may
4142 // produce a vector, to preserve existing behavior as VPInstructions model
4143 // aspects not directly mapped to existing IR instructions.
4144 switch (R.getVPDefID()) {
4145 case VPDef::VPDerivedIVSC:
4146 case VPDef::VPScalarIVStepsSC:
4147 case VPDef::VPReplicateSC:
4148 case VPDef::VPInstructionSC:
4149 case VPDef::VPCanonicalIVPHISC:
4150 case VPDef::VPVectorPointerSC:
4151 case VPDef::VPVectorEndPointerSC:
4152 case VPDef::VPExpandSCEVSC:
4153 case VPDef::VPEVLBasedIVPHISC:
4154 case VPDef::VPPredInstPHISC:
4155 case VPDef::VPBranchOnMaskSC:
4156 continue;
4157 case VPDef::VPReductionSC:
4158 case VPDef::VPActiveLaneMaskPHISC:
4159 case VPDef::VPWidenCallSC:
4160 case VPDef::VPWidenCanonicalIVSC:
4161 case VPDef::VPWidenCastSC:
4162 case VPDef::VPWidenGEPSC:
4163 case VPDef::VPWidenIntrinsicSC:
4164 case VPDef::VPWidenSC:
4165 case VPDef::VPWidenSelectSC:
4166 case VPDef::VPBlendSC:
4167 case VPDef::VPFirstOrderRecurrencePHISC:
4168 case VPDef::VPHistogramSC:
4169 case VPDef::VPWidenPHISC:
4170 case VPDef::VPWidenIntOrFpInductionSC:
4171 case VPDef::VPWidenPointerInductionSC:
4172 case VPDef::VPReductionPHISC:
4173 case VPDef::VPInterleaveSC:
4174 case VPDef::VPWidenLoadEVLSC:
4175 case VPDef::VPWidenLoadSC:
4176 case VPDef::VPWidenStoreEVLSC:
4177 case VPDef::VPWidenStoreSC:
4178 break;
4179 default:
4180 llvm_unreachable("unhandled recipe");
4181 }
4182
4183 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4184 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4185 if (!NumLegalParts)
4186 return false;
4187 if (VF.isScalable()) {
4188 // <vscale x 1 x iN> is assumed to be profitable over iN because
4189 // scalable registers are a distinct register class from scalar
4190 // ones. If we ever find a target which wants to lower scalable
4191 // vectors back to scalars, we'll need to update this code to
4192 // explicitly ask TTI about the register class uses for each part.
4193 return NumLegalParts <= VF.getKnownMinValue();
4194 }
4195         // Two or more elements sharing a register are vectorized.
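        // For instance (illustrative values): a fixed VF of 4 with i32
        // elements on a target with 128-bit vectors legalizes to one part,
        // and 1 < 4, so a true vector results.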
4196 return NumLegalParts < VF.getFixedValue();
4197 };
4198
4199       // No defs and not a store (e.g., a branch): nothing to check; continue.
4200 if (R.getNumDefinedValues() == 0 &&
4201 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4202 &R))
4203 continue;
4204       // For multi-def recipes (currently only interleaved loads), it suffices
4205       // to check the first def only.
4206       // For stores, check the stored value; for interleaved stores, it
4207       // suffices to check the first stored value only. In all cases this is
4208       // the second operand.
4209 VPValue *ToCheck =
4210 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4211 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4212 if (!Visited.insert({ScalarTy}).second)
4213 continue;
4214 Type *WideTy = toVectorizedTy(ScalarTy, VF);
4215 if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
4216 return true;
4217 }
4218 }
4219
4220 return false;
4221 }
4222
4223 static bool hasReplicatorRegion(VPlan &Plan) {
4224 return any_of(VPBlockUtils::blocksOnly<VPRegionBlock>(vp_depth_first_shallow(
4225 Plan.getVectorLoopRegion()->getEntry())),
4226 [](auto *VPRB) { return VPRB->isReplicator(); });
4227 }
4228
4229 #ifndef NDEBUG
4230 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4231 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4232 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4233 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4234 assert(
4235 any_of(VPlans,
4236 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4237 "Expected Scalar VF to be a candidate");
4238
4239 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4240 ExpectedCost);
4241 VectorizationFactor ChosenFactor = ScalarCost;
4242
4243 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4244 if (ForceVectorization &&
4245 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4246 // Ignore scalar width, because the user explicitly wants vectorization.
4247 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4248 // evaluation.
4249 ChosenFactor.Cost = InstructionCost::getMax();
4250 }
4251
4252 for (auto &P : VPlans) {
4253 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4254 P->vectorFactors().end());
4255
4256 SmallVector<VPRegisterUsage, 8> RUs;
4257 if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
4258 CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
4259 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4260
4261 for (unsigned I = 0; I < VFs.size(); I++) {
4262 ElementCount VF = VFs[I];
4263 // The cost for scalar VF=1 is already calculated, so ignore it.
4264 if (VF.isScalar())
4265 continue;
4266
4267       // Don't consider the VF if it exceeds the number of registers for the
4268       // target.
4269 if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI))
4270 continue;
4271
4272 InstructionCost C = CM.expectedCost(VF);
4273
4274 // Add on other costs that are modelled in VPlan, but not in the legacy
4275 // cost model.
4276 VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
4277 CM, CM.CostKind);
4278 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4279 assert(VectorRegion && "Expected to have a vector region!");
4280 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4281 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4282 for (VPRecipeBase &R : *VPBB) {
4283 auto *VPI = dyn_cast<VPInstruction>(&R);
4284 if (!VPI)
4285 continue;
4286 switch (VPI->getOpcode()) {
4287 case VPInstruction::ActiveLaneMask:
4288 case VPInstruction::ExplicitVectorLength:
4289 C += VPI->cost(VF, CostCtx);
4290 break;
4291 default:
4292 break;
4293 }
4294 }
4295 }
4296
4297 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4298 unsigned Width =
4299 getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
4300 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4301 << " costs: " << (Candidate.Cost / Width));
4302 if (VF.isScalable())
4303 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4304 << CM.getVScaleForTuning().value_or(1) << ")");
4305 LLVM_DEBUG(dbgs() << ".\n");
4306
4307 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4308 LLVM_DEBUG(
4309 dbgs()
4310 << "LV: Not considering vector loop of width " << VF
4311 << " because it will not generate any vector instructions.\n");
4312 continue;
4313 }
4314
4315 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4316 LLVM_DEBUG(
4317 dbgs()
4318 << "LV: Not considering vector loop of width " << VF
4319 << " because it would cause replicated blocks to be generated,"
4320 << " which isn't allowed when optimizing for size.\n");
4321 continue;
4322 }
4323
4324 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4325 ChosenFactor = Candidate;
4326 }
4327 }
4328
4329 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4330 reportVectorizationFailure(
4331 "There are conditional stores.",
4332 "store that is conditionally executed prevents vectorization",
4333 "ConditionalStore", ORE, OrigLoop);
4334 ChosenFactor = ScalarCost;
4335 }
4336
4337 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4338 !isMoreProfitable(ChosenFactor, ScalarCost,
4339 !CM.foldTailByMasking())) dbgs()
4340 << "LV: Vectorization seems to be not beneficial, "
4341 << "but was forced by a user.\n");
4342 return ChosenFactor;
4343 }
4344 #endif
4345
4346 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4347 ElementCount VF) const {
4348   // Cross-iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4349 // reductions need special handling and are currently unsupported.
4350 if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
4351 if (!Legal->isReductionVariable(&Phi))
4352 return Legal->isFixedOrderRecurrence(&Phi);
4353 RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
4354 return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
4355 }))
4356 return false;
4357
4358 // Phis with uses outside of the loop require special handling and are
4359 // currently unsupported.
4360 for (const auto &Entry : Legal->getInductionVars()) {
4361 // Look for uses of the value of the induction at the last iteration.
4362 Value *PostInc =
4363 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4364 for (User *U : PostInc->users())
4365 if (!OrigLoop->contains(cast<Instruction>(U)))
4366 return false;
4367 // Look for uses of penultimate value of the induction.
4368 for (User *U : Entry.first->users())
4369 if (!OrigLoop->contains(cast<Instruction>(U)))
4370 return false;
4371 }
4372
4373   // Epilogue vectorization code has not been audited to ensure it handles
4374   // non-latch exits properly. It may be fine, but it needs to be audited and
4375   // tested.
4376 // TODO: Add support for loops with an early exit.
4377 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4378 return false;
4379
4380 return true;
4381 }
4382
4383 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4384 const ElementCount VF, const unsigned IC) const {
4385 // FIXME: We need a much better cost-model to take different parameters such
4386 // as register pressure, code size increase and cost of extra branches into
4387 // account. For now we apply a very crude heuristic and only consider loops
4388 // with vectorization factors larger than a certain value.
4389
4390 // Allow the target to opt out entirely.
4391 if (!TTI.preferEpilogueVectorization())
4392 return false;
4393
4394 // We also consider epilogue vectorization unprofitable for targets that don't
4395   // consider interleaving beneficial (e.g., MVE).
4396 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4397 return false;
4398
4399 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4400 // VFs when deciding profitability.
4401 // See related "TODO: extend to support scalable VFs." in
4402 // selectEpilogueVectorizationFactor.
4403 unsigned Multiplier = VF.isFixed() ? IC : 1;
4404 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4405 ? EpilogueVectorizationMinVF
4406 : TTI.getEpilogueVectorizationMinVF();
4407 return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
4408 MinVFThreshold;
4409 }
4410
4411 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4412 const ElementCount MainLoopVF, unsigned IC) {
4413 VectorizationFactor Result = VectorizationFactor::Disabled();
4414 if (!EnableEpilogueVectorization) {
4415 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4416 return Result;
4417 }
4418
4419 if (!CM.isScalarEpilogueAllowed()) {
4420 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4421 "epilogue is allowed.\n");
4422 return Result;
4423 }
4424
4425 // Not really a cost consideration, but check for unsupported cases here to
4426 // simplify the logic.
4427 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4428 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4429 "is not a supported candidate.\n");
4430 return Result;
4431 }
4432
4433 if (EpilogueVectorizationForceVF > 1) {
4434 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4435 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4436 if (hasPlanWithVF(ForcedEC))
4437 return {ForcedEC, 0, 0};
4438
4439 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4440 "viable.\n");
4441 return Result;
4442 }
4443
4444 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4445 LLVM_DEBUG(
4446 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4447 return Result;
4448 }
4449
4450 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4451 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4452 "this loop\n");
4453 return Result;
4454 }
4455
4456 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4457 // the main loop handles 8 lanes per iteration. We could still benefit from
4458 // vectorizing the epilogue loop with VF=4.
4459 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4460 getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
4461
4462 ScalarEvolution &SE = *PSE.getSE();
4463 Type *TCType = Legal->getWidestInductionType();
4464 const SCEV *RemainingIterations = nullptr;
4465 unsigned MaxTripCount = 0;
4466 for (auto &NextVF : ProfitableVFs) {
4467 // Skip candidate VFs without a corresponding VPlan.
4468 if (!hasPlanWithVF(NextVF.Width))
4469 continue;
4470
4471 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4472 // vectors) or > the VF of the main loop (fixed vectors).
4473 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4474 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4475 (NextVF.Width.isScalable() &&
4476 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4477 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4478 ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4479 continue;
4480
4481 // If NextVF is greater than the number of remaining iterations, the
4482 // epilogue loop would be dead. Skip such factors.
4483 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4484 // TODO: extend to support scalable VFs.
4485 if (!RemainingIterations) {
4486 const SCEV *TC = vputils::getSCEVExprForVPValue(
4487 getPlanFor(NextVF.Width).getTripCount(), SE);
4488 assert(!isa<SCEVCouldNotCompute>(TC) &&
4489 "Trip count SCEV must be computable");
4490 RemainingIterations = SE.getURemExpr(
4491 TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
4492 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4493 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4494 SE.getConstant(TCType, MaxTripCount))) {
4495 MaxTripCount =
4496 SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4497 }
4498 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4499 << MaxTripCount << "\n");
4500 }
4501 if (SE.isKnownPredicate(
4502 CmpInst::ICMP_UGT,
4503 SE.getConstant(TCType, NextVF.Width.getFixedValue()),
4504 RemainingIterations))
4505 continue;
4506 }
4507
4508 if (Result.Width.isScalar() ||
4509 isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
4510 Result = NextVF;
4511 }
4512
4513 if (Result != VectorizationFactor::Disabled())
4514 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4515 << Result.Width << "\n");
4516 return Result;
4517 }
4518
4519 std::pair<unsigned, unsigned>
4520 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4521 unsigned MinWidth = -1U;
4522 unsigned MaxWidth = 8;
4523 const DataLayout &DL = TheFunction->getDataLayout();
4524 // For in-loop reductions, no element types are added to ElementTypesInLoop
4525 // if there are no loads/stores in the loop. In this case, check through the
4526 // reduction variables to determine the maximum width.
4527 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4528 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4529 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4530 // When finding the min width used by the recurrence we need to account
4531 // for casts on the input operands of the recurrence.
4532 MinWidth = std::min(
4533 MinWidth,
4534 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4535 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4536 MaxWidth = std::max(MaxWidth,
4537 RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4538 }
4539 } else {
4540 for (Type *T : ElementTypesInLoop) {
4541 MinWidth = std::min<unsigned>(
4542 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4543 MaxWidth = std::max<unsigned>(
4544 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4545 }
4546 }
4547 return {MinWidth, MaxWidth};
4548 }
4549
4550 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4551 ElementTypesInLoop.clear();
4552 // For each block.
4553 for (BasicBlock *BB : TheLoop->blocks()) {
4554 // For each instruction in the loop.
4555 for (Instruction &I : BB->instructionsWithoutDebug()) {
4556 Type *T = I.getType();
4557
4558 // Skip ignored values.
4559 if (ValuesToIgnore.count(&I))
4560 continue;
4561
4562 // Only examine Loads, Stores and PHINodes.
4563 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4564 continue;
4565
4566 // Examine PHI nodes that are reduction variables. Update the type to
4567 // account for the recurrence type.
4568 if (auto *PN = dyn_cast<PHINode>(&I)) {
4569 if (!Legal->isReductionVariable(PN))
4570 continue;
4571 const RecurrenceDescriptor &RdxDesc =
4572 Legal->getRecurrenceDescriptor(PN);
4573 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4574 TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
4575 RdxDesc.getRecurrenceType()))
4576 continue;
4577 T = RdxDesc.getRecurrenceType();
4578 }
4579
4580 // Examine the stored values.
4581 if (auto *ST = dyn_cast<StoreInst>(&I))
4582 T = ST->getValueOperand()->getType();
4583
4584 assert(T->isSized() &&
4585 "Expected the load/store/recurrence type to be sized");
4586
4587 ElementTypesInLoop.insert(T);
4588 }
4589 }
4590 }
4591
4592 unsigned
4593 LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4594 InstructionCost LoopCost) {
4595 // -- The interleave heuristics --
4596 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4597 // There are many micro-architectural considerations that we can't predict
4598 // at this level. For example, frontend pressure (on decode or fetch) due to
4599 // code size, or the number and capabilities of the execution ports.
4600 //
4601 // We use the following heuristics to select the interleave count:
4602 // 1. If the code has reductions, then we interleave to break the cross
4603 // iteration dependency.
4604 // 2. If the loop is really small, then we interleave to reduce the loop
4605 // overhead.
4606 // 3. We don't interleave if we think that we will spill registers to memory
4607 // due to the increased register pressure.
4608
4609 if (!isScalarEpilogueAllowed())
4610 return 1;
4611
4612 // Do not interleave if EVL is preferred and no User IC is specified.
4613 if (foldTailWithEVL()) {
4614 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4615 "Unroll factor forced to be 1.\n");
4616 return 1;
4617 }
4618
4619   // The max safe dependence distance already caps the VF, so don't interleave.
4620 if (!Legal->isSafeForAnyVectorWidth())
4621 return 1;
4622
4623 // We don't attempt to perform interleaving for loops with uncountable early
4624 // exits because the VPInstruction::AnyOf code cannot currently handle
4625 // multiple parts.
4626 if (Legal->hasUncountableEarlyExit())
4627 return 1;
4628
4629 const bool HasReductions = !Legal->getReductionVars().empty();
4630
4631 // If we did not calculate the cost for VF (because the user selected the VF)
4632 // then we calculate the cost of VF here.
4633 if (LoopCost == 0) {
4634 LoopCost = expectedCost(VF);
4635 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4636
4637 // Loop body is free and there is no need for interleaving.
4638 if (LoopCost == 0)
4639 return 1;
4640 }
4641
4642 VPRegisterUsage R =
4643 calculateRegisterUsageForPlan(Plan, {VF}, TTI, ValuesToIgnore)[0];
4644 // We divide by these constants so assume that we have at least one
4645 // instruction that uses at least one register.
4646 for (auto &Pair : R.MaxLocalUsers) {
4647 Pair.second = std::max(Pair.second, 1U);
4648 }
4649
4650 // We calculate the interleave count using the following formula.
4651 // Subtract the number of loop invariants from the number of available
4652 // registers. These registers are used by all of the interleaved instances.
4653 // Next, divide the remaining registers by the number of registers that is
4654 // required by the loop, in order to estimate how many parallel instances
4655 // fit without causing spills. All of this is rounded down if necessary to be
4656 // a power of two. We want power of two interleave count to simplify any
4657 // addressing operations or alignment considerations.
4658 // We also want power of two interleave counts to ensure that the induction
4659 // variable of the vector loop wraps to zero, when tail is folded by masking;
4660 // this currently happens when OptForSize, in which case IC is set to 1 above.
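  // For instance (illustrative values): with 32 available registers, 2 of
  // them pinned by loop-invariant values and at most 6 live registers inside
  // the loop, roughly bit_floor((32 - 2) / 6) = 4 interleaved instances fit
  // without spilling.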
4661 unsigned IC = UINT_MAX;
4662
4663 for (const auto &Pair : R.MaxLocalUsers) {
4664 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4665 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4666 << " registers of "
4667 << TTI.getRegisterClassName(Pair.first)
4668 << " register class\n");
4669 if (VF.isScalar()) {
4670 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4671 TargetNumRegisters = ForceTargetNumScalarRegs;
4672 } else {
4673 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4674 TargetNumRegisters = ForceTargetNumVectorRegs;
4675 }
4676 unsigned MaxLocalUsers = Pair.second;
4677 unsigned LoopInvariantRegs = 0;
4678 if (R.LoopInvariantRegs.contains(Pair.first))
4679 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4680
4681 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4682 MaxLocalUsers);
4683 // Don't count the induction variable as interleaved.
4684 if (EnableIndVarRegisterHeur) {
4685 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4686 std::max(1U, (MaxLocalUsers - 1)));
4687 }
4688
4689 IC = std::min(IC, TmpIC);
4690 }
4691
4692 // Clamp the interleave ranges to reasonable counts.
4693 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4694
4695 // Check if the user has overridden the max.
4696 if (VF.isScalar()) {
4697 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4698 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4699 } else {
4700 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4701 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4702 }
4703
4704 unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
4705
4706 // Try to get the exact trip count, or an estimate based on profiling data or
4707 // ConstantMax from PSE, failing that.
4708 if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
4709 // At least one iteration must be scalar when this constraint holds. So the
4710 // maximum available iterations for interleaving is one less.
4711 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4712 ? BestKnownTC->getFixedValue() - 1
4713 : BestKnownTC->getFixedValue();
4714
4715 unsigned InterleaveCountLB = bit_floor(std::max(
4716 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4717
4718 if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
4719 // If the best known trip count is exact, we select between two
4720 // prospective ICs, where
4721 //
4722 // 1) the aggressive IC is capped by the trip count divided by VF
4723 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4724 //
4725 // The final IC is selected in a way that the epilogue loop trip count is
4726 // minimized while maximizing the IC itself, so that we either run the
4727 // vector loop at least once if it generates a small epilogue loop, or
4728 // else we run the vector loop at least twice.
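      // For instance (illustrative values, assuming the target's maximum
      // allows it): AvailableTC = 64 with EstimatedVF = 8 gives a
      // conservative IC of bit_floor(64 / 16) = 4 and an aggressive IC of
      // bit_floor(64 / 8) = 8; both leave no scalar tail, so the aggressive
      // IC of 8 is chosen.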
4729
4730 unsigned InterleaveCountUB = bit_floor(std::max(
4731 1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4732 MaxInterleaveCount = InterleaveCountLB;
4733
4734 if (InterleaveCountUB != InterleaveCountLB) {
4735 unsigned TailTripCountUB =
4736 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4737 unsigned TailTripCountLB =
4738 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4739         // If both produce the same scalar tail, maximize the IC to do the
4740         // same work in fewer vector loop iterations.
4741 if (TailTripCountUB == TailTripCountLB)
4742 MaxInterleaveCount = InterleaveCountUB;
4743 }
4744 } else {
4745       // If the trip count is only an estimated compile-time constant, limit
4746       // the IC to the trip count divided by VF * 2, such that the vector loop
4747       // runs at least twice to make interleaving seem profitable when there is
4748       // an epilogue loop present. Since the exact trip count is not known, we
4749       // choose to be conservative in our IC estimate.
4750 MaxInterleaveCount = InterleaveCountLB;
4751 }
4752 }
4753
4754 assert(MaxInterleaveCount > 0 &&
4755 "Maximum interleave count must be greater than 0");
4756
4757   // Clamp the calculated IC to be between 1 and the max interleave count
4758   // that the target and trip count allow.
4759 if (IC > MaxInterleaveCount)
4760 IC = MaxInterleaveCount;
4761 else
4762 // Make sure IC is greater than 0.
4763 IC = std::max(1u, IC);
4764
4765 assert(IC > 0 && "Interleave count must be greater than 0.");
4766
4767 // Interleave if we vectorized this loop and there is a reduction that could
4768 // benefit from interleaving.
4769 if (VF.isVector() && HasReductions) {
4770 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4771 return IC;
4772 }
4773
4774 // For any scalar loop that either requires runtime checks or predication we
4775 // are better off leaving this to the unroller. Note that if we've already
4776 // vectorized the loop we will have done the runtime check and so interleaving
4777 // won't require further checks.
4778 bool ScalarInterleavingRequiresPredication =
4779 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
4780 return Legal->blockNeedsPredication(BB);
4781 }));
4782 bool ScalarInterleavingRequiresRuntimePointerCheck =
4783 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
4784
4785 // We want to interleave small loops in order to reduce the loop overhead and
4786 // potentially expose ILP opportunities.
4787 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
4788 << "LV: IC is " << IC << '\n'
4789 << "LV: VF is " << VF << '\n');
4790 const bool AggressivelyInterleaveReductions =
4791 TTI.enableAggressiveInterleaving(HasReductions);
4792 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
4793 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
4794 // We assume that the cost overhead is 1 and we use the cost model
4795 // to estimate the cost of the loop and interleave until the cost of the
4796 // loop overhead is about 5% of the cost of the loop.
4797 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
4798 SmallLoopCost / LoopCost.getValue()));
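    // For instance (illustrative values): with SmallLoopCost = 20 and
    // LoopCost = 3, SmallIC becomes min(IC, bit_floor(20 / 3)) = min(IC, 4).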
4799
4800 // Interleave until store/load ports (estimated by max interleave count) are
4801 // saturated.
4802 unsigned NumStores = Legal->getNumStores();
4803 unsigned NumLoads = Legal->getNumLoads();
4804 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4805 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
4806
4807 // There is little point in interleaving for reductions containing selects
4808 // and compares when VF=1 since it may just create more overhead than it's
4809 // worth for loops with small trip counts. This is because we still have to
4810 // do the final reduction after the loop.
4811 bool HasSelectCmpReductions =
4812 HasReductions &&
4813 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
4814 const RecurrenceDescriptor &RdxDesc = Reduction.second;
4815 RecurKind RK = RdxDesc.getRecurrenceKind();
4816 return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
4817 RecurrenceDescriptor::isFindIVRecurrenceKind(RK);
4818 });
4819 if (HasSelectCmpReductions) {
4820 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
4821 return 1;
4822 }
4823
4824 // If we have a scalar reduction (vector reductions are already dealt with
4825 // by this point), we can increase the critical path length if the loop
4826 // we're interleaving is inside another loop. For tree-wise reductions
4827 // set the limit to 2, and for ordered reductions it's best to disable
4828 // interleaving entirely.
4829 if (HasReductions && TheLoop->getLoopDepth() > 1) {
4830 bool HasOrderedReductions =
4831 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
4832 const RecurrenceDescriptor &RdxDesc = Reduction.second;
4833 return RdxDesc.isOrdered();
4834 });
4835 if (HasOrderedReductions) {
4836 LLVM_DEBUG(
4837 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
4838 return 1;
4839 }
4840
4841 unsigned F = MaxNestedScalarReductionIC;
4842 SmallIC = std::min(SmallIC, F);
4843 StoresIC = std::min(StoresIC, F);
4844 LoadsIC = std::min(LoadsIC, F);
4845 }
4846
4847 if (EnableLoadStoreRuntimeInterleave &&
4848 std::max(StoresIC, LoadsIC) > SmallIC) {
4849 LLVM_DEBUG(
4850 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4851 return std::max(StoresIC, LoadsIC);
4852 }
4853
4854 // If there are scalar reductions and TTI has enabled aggressive
4855 // interleaving for reductions, we will interleave to expose ILP.
4856 if (VF.isScalar() && AggressivelyInterleaveReductions) {
4857 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4858       // Interleave no less than SmallIC but not as aggressively as the normal
4859       // IC, to satisfy the rare situation when resources are too limited.
4860 return std::max(IC / 2, SmallIC);
4861 }
4862
4863 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4864 return SmallIC;
4865 }
4866
4867 // Interleave if this is a large loop (small loops are already dealt with by
4868 // this point) that could benefit from interleaving.
4869 if (AggressivelyInterleaveReductions) {
4870 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4871 return IC;
4872 }
4873
4874 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4875 return 1;
4876 }
4877
4878 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4879 ElementCount VF) {
4880 // TODO: Cost model for emulated masked load/store is completely
4881 // broken. This hack guides the cost model to use an artificially
4882 // high enough value to practically disable vectorization with such
4883 // operations, except where previously deployed legality hack allowed
4884 // using very low cost values. This is to avoid regressions coming simply
4885 // from moving "masked load/store" check from legality to cost model.
4886 // Masked Load/Gather emulation was previously never allowed.
4887 // Limited number of Masked Store/Scatter emulation was allowed.
4888   assert(isPredicatedInst(I) &&
4889          "Expecting a scalar emulated instruction");
4890 return isa<LoadInst>(I) ||
4891 (isa<StoreInst>(I) &&
4892 NumPredStores > NumberOfStoresToPredicate);
4893 }
4894
4895 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
4896 assert(VF.isVector() && "Expected VF >= 2");
4897
4898 // If we've already collected the instructions to scalarize or the predicated
4899 // BBs after vectorization, there's nothing to do. Collection may already have
4900 // occurred if we have a user-selected VF and are now computing the expected
4901 // cost for interleaving.
4902 if (InstsToScalarize.contains(VF) ||
4903 PredicatedBBsAfterVectorization.contains(VF))
4904 return;
4905
4906   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
4907 // not profitable to scalarize any instructions, the presence of VF in the
4908 // map will indicate that we've analyzed it already.
4909 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
4910
4911 // Find all the instructions that are scalar with predication in the loop and
4912 // determine if it would be better to not if-convert the blocks they are in.
4913 // If so, we also record the instructions to scalarize.
4914 for (BasicBlock *BB : TheLoop->blocks()) {
4915 if (!blockNeedsPredicationForAnyReason(BB))
4916 continue;
4917 for (Instruction &I : *BB)
4918 if (isScalarWithPredication(&I, VF)) {
4919 ScalarCostsTy ScalarCosts;
4920 // Do not apply discount logic for:
4921 // 1. Scalars after vectorization, as there will only be a single copy
4922 // of the instruction.
4923 // 2. Scalable VF, as that would lead to invalid scalarization costs.
4924 // 3. Emulated masked memrefs, if a hacked cost is needed.
4925 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
4926 !useEmulatedMaskMemRefHack(&I, VF) &&
4927 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
4928 ScalarCostsVF.insert_range(ScalarCosts);
4929 // Check if we decided to scalarize a call. If so, update the widening
4930 // decision of the call to CM_Scalarize with the computed scalar cost.
4931 for (const auto &[I, Cost] : ScalarCosts) {
4932 auto *CI = dyn_cast<CallInst>(I);
4933 if (!CI || !CallWideningDecisions.contains({CI, VF}))
4934 continue;
4935 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
4936 CallWideningDecisions[{CI, VF}].Cost = Cost;
4937 }
4938 }
4939 // Remember that BB will remain after vectorization.
4940 PredicatedBBsAfterVectorization[VF].insert(BB);
4941 for (auto *Pred : predecessors(BB)) {
4942 if (Pred->getSingleSuccessor() == BB)
4943 PredicatedBBsAfterVectorization[VF].insert(Pred);
4944 }
4945 }
4946 }
4947 }
4948
4949 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
4950 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
4951 assert(!isUniformAfterVectorization(PredInst, VF) &&
4952 "Instruction marked uniform-after-vectorization will be predicated");
4953
4954 // Initialize the discount to zero, meaning that the scalar version and the
4955 // vector version cost the same.
4956 InstructionCost Discount = 0;
4957
4958 // Holds instructions to analyze. The instructions we visit are mapped in
4959 // ScalarCosts. Those instructions are the ones that would be scalarized if
4960 // we find that the scalar version costs less.
4961 SmallVector<Instruction *, 8> Worklist;
4962
4963 // Returns true if the given instruction can be scalarized.
4964 auto CanBeScalarized = [&](Instruction *I) -> bool {
4965 // We only attempt to scalarize instructions forming a single-use chain
4966 // from the original predicated block that would otherwise be vectorized.
4967 // Although not strictly necessary, we give up on instructions we know will
4968 // already be scalar to avoid traversing chains that are unlikely to be
4969 // beneficial.
4970 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
4971 isScalarAfterVectorization(I, VF))
4972 return false;
4973
4974 // If the instruction is scalar with predication, it will be analyzed
4975 // separately. We ignore it within the context of PredInst.
4976 if (isScalarWithPredication(I, VF))
4977 return false;
4978
4979 // If any of the instruction's operands are uniform after vectorization,
4980 // the instruction cannot be scalarized. This prevents, for example, a
4981 // masked load from being scalarized.
4982 //
4983 // We assume we will only emit a value for lane zero of an instruction
4984 // marked uniform after vectorization, rather than VF identical values.
4985 // Thus, if we scalarize an instruction that uses a uniform, we would
4986 // create uses of values corresponding to the lanes we aren't emitting code
4987 // for. This behavior can be changed by allowing getScalarValue to clone
4988 // the lane zero values for uniforms rather than asserting.
4989 for (Use &U : I->operands())
4990 if (auto *J = dyn_cast<Instruction>(U.get()))
4991 if (isUniformAfterVectorization(J, VF))
4992 return false;
4993
4994 // Otherwise, we can scalarize the instruction.
4995 return true;
4996 };
4997
4998 // Compute the expected cost discount from scalarizing the entire expression
4999 // feeding the predicated instruction. We currently only consider expressions
5000 // that are single-use instruction chains.
5001 Worklist.push_back(PredInst);
5002 while (!Worklist.empty()) {
5003 Instruction *I = Worklist.pop_back_val();
5004
5005 // If we've already analyzed the instruction, there's nothing to do.
5006 if (ScalarCosts.contains(I))
5007 continue;
5008
5009 // Cannot scalarize fixed-order recurrence phis at the moment.
5010 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5011 continue;
5012
5013 // Compute the cost of the vector instruction. Note that this cost already
5014 // includes the scalarization overhead of the predicated instruction.
5015 InstructionCost VectorCost = getInstructionCost(I, VF);
5016
5017 // Compute the cost of the scalarized instruction. This cost is the cost of
5018 // the instruction as if it wasn't if-converted and instead remained in the
5019 // predicated block. We will scale this cost by block probability after
5020 // computing the scalarization overhead.
5021 InstructionCost ScalarCost =
5022 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5023
5024 // Compute the scalarization overhead of needed insertelement instructions
5025 // and phi nodes.
5026 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5027 Type *WideTy = toVectorizedTy(I->getType(), VF);
5028 for (Type *VectorTy : getContainedTypes(WideTy)) {
5029 ScalarCost += TTI.getScalarizationOverhead(
5030 cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5031 /*Insert=*/true,
5032 /*Extract=*/false, CostKind);
5033 }
5034 ScalarCost +=
5035 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5036 }
5037
5038 // Compute the scalarization overhead of needed extractelement
5039 // instructions. For each of the instruction's operands, if the operand can
5040 // be scalarized, add it to the worklist; otherwise, account for the
5041 // overhead.
5042 for (Use &U : I->operands())
5043 if (auto *J = dyn_cast<Instruction>(U.get())) {
5044 assert(canVectorizeTy(J->getType()) &&
5045 "Instruction has non-scalar type");
5046 if (CanBeScalarized(J))
5047 Worklist.push_back(J);
5048 else if (needsExtract(J, VF)) {
5049 Type *WideTy = toVectorizedTy(J->getType(), VF);
5050 for (Type *VectorTy : getContainedTypes(WideTy)) {
5051 ScalarCost += TTI.getScalarizationOverhead(
5052 cast<VectorType>(VectorTy),
5053 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5054 /*Extract*/ true, CostKind);
5055 }
5056 }
5057 }
5058
5059 // Scale the total scalar cost by block probability.
5060 ScalarCost /= getPredBlockCostDivisor(CostKind);
5061
5062 // Compute the discount. A non-negative discount means the vector version
5063 // of the instruction costs more, and scalarizing would be beneficial.
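    // For instance (illustrative values): VectorCost = 10 and ScalarCost = 6
    // add a discount of 4 in favor of scalarizing this chain.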
5064 Discount += VectorCost - ScalarCost;
5065 ScalarCosts[I] = ScalarCost;
5066 }
5067
5068 return Discount;
5069 }
5070
5071 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5072 InstructionCost Cost;
5073
5074 // If the vector loop gets executed exactly once with the given VF, ignore the
5075 // costs of comparison and induction instructions, as they'll get simplified
5076 // away.
5077 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5078 auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
5079 if (TC == VF && !foldTailByMasking())
5080 addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5081 ValuesToIgnoreForVF);
5082
5083 // For each block.
5084 for (BasicBlock *BB : TheLoop->blocks()) {
5085 InstructionCost BlockCost;
5086
5087 // For each instruction in the old loop.
5088 for (Instruction &I : BB->instructionsWithoutDebug()) {
5089 // Skip ignored values.
5090 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5091 (VF.isVector() && VecValuesToIgnore.count(&I)))
5092 continue;
5093
5094 InstructionCost C = getInstructionCost(&I, VF);
5095
5096 // Check if we should override the cost.
5097 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5098 C = InstructionCost(ForceTargetInstructionCost);
5099
5100 BlockCost += C;
5101 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5102 << VF << " For instruction: " << I << '\n');
5103 }
5104
5105 // If we are vectorizing a predicated block, it will have been
5106 // if-converted. This means that the block's instructions (aside from
5107 // stores and instructions that may divide by zero) will now be
5108 // unconditionally executed. For the scalar case, we may not always execute
5109 // the predicated block, if it is an if-else block. Thus, scale the block's
5110 // cost by the probability of executing it. blockNeedsPredication from
5111 // Legal is used so as to not include all blocks in tail folded loops.
5112 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5113 BlockCost /= getPredBlockCostDivisor(CostKind);
5114
5115 Cost += BlockCost;
5116 }
5117
5118 return Cost;
5119 }
5120
5121 /// Gets the address access SCEV after verifying that the access pattern
5122 /// is loop invariant except for the induction variable dependence.
5123 ///
5124 /// This SCEV can be sent to the Target in order to estimate the address
5125 /// calculation cost.
5126 static const SCEV *getAddressAccessSCEV(
5127 Value *Ptr,
5128 LoopVectorizationLegality *Legal,
5129 PredicatedScalarEvolution &PSE,
5130 const Loop *TheLoop) {
5131
5132 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5133 if (!Gep)
5134 return nullptr;
5135
5136 // We are looking for a gep with all loop invariant indices except for one
5137 // which should be an induction variable.
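// For example, for a hypothetical GEP (with %inv loop-invariant and %iv an
// induction variable):
//   %gep = getelementptr [4 x i32], ptr %base, i64 %inv, i64 %iv
// all indices are loop invariant except the induction %iv, so the pointer
// SCEV can be handed to the target to estimate address computation cost.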
5138 auto *SE = PSE.getSE();
5139 unsigned NumOperands = Gep->getNumOperands();
5140 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5141 Value *Opd = Gep->getOperand(Idx);
5142 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5143 !Legal->isInductionVariable(Opd))
5144 return nullptr;
5145 }
5146
5147 // Now we know we have a GEP like ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5148 return PSE.getSCEV(Ptr);
5149 }
5150
5151 InstructionCost
5152 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5153 ElementCount VF) {
5154 assert(VF.isVector() &&
5155 "Scalarization cost of instruction implies vectorization.");
5156 if (VF.isScalable())
5157 return InstructionCost::getInvalid();
5158
5159 Type *ValTy = getLoadStoreType(I);
5160 auto *SE = PSE.getSE();
5161
5162 unsigned AS = getLoadStoreAddressSpace(I);
5163 Value *Ptr = getLoadStorePointerOperand(I);
5164 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5165 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5166 // that it is being called from this specific place.
5167
5168 // Figure out whether the access is strided, and get the stride value
5169 // if it is known at compile time.
5170 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5171
5172 // Get the cost of the scalar memory instruction and address computation.
5173 InstructionCost Cost =
5174 VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5175
5176 // Don't pass *I here, since it is scalar but will actually be part of a
5177 // vectorized loop where the user of it is a vectorized instruction.
5178 const Align Alignment = getLoadStoreAlignment(I);
5179 Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
5180 ValTy->getScalarType(),
5181 Alignment, AS, CostKind);
5182
5183 // Get the overhead of the extractelement and insertelement instructions
5184 // we might create due to scalarization.
5185 Cost += getScalarizationOverhead(I, VF);
5186
5187 // If we have a predicated load/store, it will need extra i1 extracts and
5188 // conditional branches, but may not be executed for each vector lane. Scale
5189 // the cost by the probability of executing the predicated block.
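// As a rough sketch (hypothetical VF = 4 masked store), the emitted code
// per lane looks like:
//   %m = extractelement <4 x i1> %mask, i64 0
//   br i1 %m, label %pred.store.if, label %pred.store.continue
// which is why an i1 extract and a branch are costed below.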
5190 if (isPredicatedInst(I)) {
5191 Cost /= getPredBlockCostDivisor(CostKind);
5192
5193 // Add the cost of an i1 extract and a branch
5194 auto *VecI1Ty =
5195 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5196 Cost += TTI.getScalarizationOverhead(
5197 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5198 /*Insert=*/false, /*Extract=*/true, CostKind);
5199 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5200
5201 if (useEmulatedMaskMemRefHack(I, VF))
5202 // Artificially set the cost to a value high enough to practically
5203 // disable vectorization with such operations.
5204 Cost = 3000000;
5205 }
5206
5207 return Cost;
5208 }
5209
5210 InstructionCost
5211 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5212 ElementCount VF) {
5213 Type *ValTy = getLoadStoreType(I);
5214 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5215 Value *Ptr = getLoadStorePointerOperand(I);
5216 unsigned AS = getLoadStoreAddressSpace(I);
5217 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5218
5219 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5220 "Stride should be 1 or -1 for consecutive memory access");
5221 const Align Alignment = getLoadStoreAlignment(I);
5222 InstructionCost Cost = 0;
5223 if (Legal->isMaskRequired(I)) {
5224 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5225 CostKind);
5226 } else {
5227 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5228 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5229 CostKind, OpInfo, I);
5230 }
5231
5232 bool Reverse = ConsecutiveStride < 0;
5233 if (Reverse)
5234 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5235 VectorTy, {}, CostKind, 0);
5236 return Cost;
5237 }
5238
5239 InstructionCost
5240 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5241 ElementCount VF) {
5242 assert(Legal->isUniformMemOp(*I, VF));
5243
5244 Type *ValTy = getLoadStoreType(I);
5245 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5246 const Align Alignment = getLoadStoreAlignment(I);
5247 unsigned AS = getLoadStoreAddressSpace(I);
5248 if (isa<LoadInst>(I)) {
5249 return TTI.getAddressComputationCost(ValTy) +
5250 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5251 CostKind) +
5252 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy,
5253 VectorTy, {}, CostKind);
5254 }
5255 StoreInst *SI = cast<StoreInst>(I);
5256
5257 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5258 // TODO: We have existing tests that request the cost of extracting element
5259 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5260 // the actual generated code, which involves extracting the last element of
5261 // a scalable vector where the lane to extract is unknown at compile time.
5262 return TTI.getAddressComputationCost(ValTy) +
5263 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5264 CostKind) +
5265 (IsLoopInvariantStoreValue
5266 ? 0
5267 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5268 CostKind, VF.getKnownMinValue() - 1));
5269 }
5270
5271 InstructionCost
5272 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5273 ElementCount VF) {
5274 Type *ValTy = getLoadStoreType(I);
5275 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5276 const Align Alignment = getLoadStoreAlignment(I);
5277 const Value *Ptr = getLoadStorePointerOperand(I);
5278
5279 return TTI.getAddressComputationCost(VectorTy) +
5280 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5281 Legal->isMaskRequired(I), Alignment,
5282 CostKind, I);
5283 }
5284
5285 InstructionCost
5286 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5287 ElementCount VF) {
5288 const auto *Group = getInterleavedAccessGroup(I);
5289 assert(Group && "Failed to get an interleaved access group.");
5290
5291 Instruction *InsertPos = Group->getInsertPos();
5292 Type *ValTy = getLoadStoreType(InsertPos);
5293 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5294 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5295
5296 unsigned InterleaveFactor = Group->getFactor();
5297 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
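// E.g. for a hypothetical factor-2 group of i32 accesses at VF = 4,
// WideVecTy is <8 x i32>: one wide memory operation plus shuffles to
// (de)interleave the group members.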
5298
5299 // Holds the indices of existing members in the interleaved group.
5300 SmallVector<unsigned, 4> Indices;
5301 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5302 if (Group->getMember(IF))
5303 Indices.push_back(IF);
5304
5305 // Calculate the cost of the whole interleaved group.
5306 bool UseMaskForGaps =
5307 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5308 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5309 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5310 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5311 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5312 UseMaskForGaps);
5313
5314 if (Group->isReverse()) {
5315 // TODO: Add support for reversed masked interleaved access.
5316 assert(!Legal->isMaskRequired(I) &&
5317 "Reverse masked interleaved access not supported.");
5318 Cost += Group->getNumMembers() *
5319 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5320 VectorTy, {}, CostKind, 0);
5321 }
5322 return Cost;
5323 }
5324
5325 std::optional<InstructionCost>
5326 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
5327 ElementCount VF,
5328 Type *Ty) const {
5329 using namespace llvm::PatternMatch;
5330 // Early exit if there are no in-loop reductions.
5331 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5332 return std::nullopt;
5333 auto *VectorTy = cast<VectorType>(Ty);
5334
5335 // We are looking for one of the following patterns, choosing the one with
// the minimal acceptable cost:
5336 // reduce(mul(ext(A), ext(B))) or
5337 // reduce(mul(A, B)) or
5338 // reduce(ext(A)) or
5339 // reduce(A).
5340 // The basic idea is that we walk down the tree to do that, finding the root
5341 // reduction instruction in InLoopReductionImmediateChains. From there we find
5342 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5343 // of the components. If the reduction cost is lower, we return it for the
5344 // reduction instruction and 0 for the other instructions in the pattern. If
5345 // it is not, we return std::nullopt, indicating that the original cost
5346 // method should be used.
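// As a concrete sketch (hypothetical types), reduce.add(mul(ext(A), ext(B)))
// corresponds to IR along the lines of:
//   %a.ext = sext <16 x i8> %a to <16 x i32>
//   %b.ext = sext <16 x i8> %b to <16 x i32>
//   %mul   = mul <16 x i32> %a.ext, %b.ext
//   %red   = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
// which some targets can lower to a single multiply-accumulate reduction.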
5347 Instruction *RetI = I;
5348 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5349 if (!RetI->hasOneUser())
5350 return std::nullopt;
5351 RetI = RetI->user_back();
5352 }
5353
5354 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5355 RetI->user_back()->getOpcode() == Instruction::Add) {
5356 RetI = RetI->user_back();
5357 }
5358
5359 // Test if the found instruction is a reduction; if not, return std::nullopt
5360 // so that the parent uses the original cost modelling.
5361 Instruction *LastChain = InLoopReductionImmediateChains.lookup(RetI);
5362 if (!LastChain)
5363 return std::nullopt;
5364
5365 // Find the reduction this chain is a part of and calculate the basic cost of
5366 // the reduction on its own.
5367 Instruction *ReductionPhi = LastChain;
5368 while (!isa<PHINode>(ReductionPhi))
5369 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5370
5371 const RecurrenceDescriptor &RdxDesc =
5372 Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
5373
5374 InstructionCost BaseCost;
5375 RecurKind RK = RdxDesc.getRecurrenceKind();
5376 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5377 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5378 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5379 RdxDesc.getFastMathFlags(), CostKind);
5380 } else {
5381 BaseCost = TTI.getArithmeticReductionCost(
5382 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5383 }
5384
5385 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5386 // normal fmul instruction to the cost of the fadd reduction.
5387 if (RK == RecurKind::FMulAdd)
5388 BaseCost +=
5389 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5390
5391 // If we're using ordered reductions then we can just return the base cost
5392 // here, since getArithmeticReductionCost calculates the full ordered
5393 // reduction cost when FP reassociation is not allowed.
5394 if (useOrderedReductions(RdxDesc))
5395 return BaseCost;
5396
5397 // Get the operand that was not the reduction chain and match it to one of the
5398 // patterns, returning the better cost if it is found.
5399 Instruction *RedOp = RetI->getOperand(1) == LastChain
5400 ? dyn_cast<Instruction>(RetI->getOperand(0))
5401 : dyn_cast<Instruction>(RetI->getOperand(1));
5402
5403 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5404
5405 Instruction *Op0, *Op1;
5406 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5407 match(RedOp,
5408 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5409 match(Op0, m_ZExtOrSExt(m_Value())) &&
5410 Op0->getOpcode() == Op1->getOpcode() &&
5411 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5412 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5413 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5414
5415 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5416 // Note that the extend opcodes need to all match, or if A==B they will have
5417 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5418 // which is equally fine.
5419 bool IsUnsigned = isa<ZExtInst>(Op0);
5420 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5421 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5422
5423 InstructionCost ExtCost =
5424 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5425 TTI::CastContextHint::None, CostKind, Op0);
5426 InstructionCost MulCost =
5427 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5428 InstructionCost Ext2Cost =
5429 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5430 TTI::CastContextHint::None, CostKind, RedOp);
5431
5432 InstructionCost RedCost = TTI.getMulAccReductionCost(
5433 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5434
5435 if (RedCost.isValid() &&
5436 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5437 return I == RetI ? RedCost : 0;
5438 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5439 !TheLoop->isLoopInvariant(RedOp)) {
5440 // Matched reduce(ext(A))
5441 bool IsUnsigned = isa<ZExtInst>(RedOp);
5442 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5443 InstructionCost RedCost = TTI.getExtendedReductionCost(
5444 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5445 RdxDesc.getFastMathFlags(), CostKind);
5446
5447 InstructionCost ExtCost =
5448 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5449 TTI::CastContextHint::None, CostKind, RedOp);
5450 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5451 return I == RetI ? RedCost : 0;
5452 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5453 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5454 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5455 Op0->getOpcode() == Op1->getOpcode() &&
5456 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5457 bool IsUnsigned = isa<ZExtInst>(Op0);
5458 Type *Op0Ty = Op0->getOperand(0)->getType();
5459 Type *Op1Ty = Op1->getOperand(0)->getType();
5460 Type *LargestOpTy =
5461 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5462 : Op0Ty;
5463 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5464
5465 // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may be of
5466 // different sizes. We take the largest type as the ext to reduce, and add
5467 // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
5468 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5469 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5470 TTI::CastContextHint::None, CostKind, Op0);
5471 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5472 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5473 TTI::CastContextHint::None, CostKind, Op1);
5474 InstructionCost MulCost =
5475 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5476
5477 InstructionCost RedCost = TTI.getMulAccReductionCost(
5478 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5479 InstructionCost ExtraExtCost = 0;
5480 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5481 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5482 ExtraExtCost = TTI.getCastInstrCost(
5483 ExtraExtOp->getOpcode(), ExtType,
5484 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5485 TTI::CastContextHint::None, CostKind, ExtraExtOp);
5486 }
5487
5488 if (RedCost.isValid() &&
5489 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5490 return I == RetI ? RedCost : 0;
5491 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5492 // Matched reduce.add(mul())
5493 InstructionCost MulCost =
5494 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5495
5496 InstructionCost RedCost = TTI.getMulAccReductionCost(
5497 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5498
5499 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5500 return I == RetI ? RedCost : 0;
5501 }
5502 }
5503
5504 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5505 }
5506
5507 InstructionCost
5508 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5509 ElementCount VF) {
5510 // Calculate scalar cost only. Vectorization cost should be ready at this
5511 // moment.
5512 if (VF.isScalar()) {
5513 Type *ValTy = getLoadStoreType(I);
5514 const Align Alignment = getLoadStoreAlignment(I);
5515 unsigned AS = getLoadStoreAddressSpace(I);
5516
5517 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5518 return TTI.getAddressComputationCost(ValTy) +
5519 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5520 OpInfo, I);
5521 }
5522 return getWideningCost(I, VF);
5523 }
5524
5525 InstructionCost
5526 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5527 ElementCount VF) const {
5528
5529 // There is no mechanism yet to create a scalable scalarization loop,
5530 // so this is currently Invalid.
5531 if (VF.isScalable())
5532 return InstructionCost::getInvalid();
5533
5534 if (VF.isScalar())
5535 return 0;
5536
5537 InstructionCost Cost = 0;
5538 Type *RetTy = toVectorizedTy(I->getType(), VF);
5539 if (!RetTy->isVoidTy() &&
5540 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5541
5542 for (Type *VectorTy : getContainedTypes(RetTy)) {
5543 Cost += TTI.getScalarizationOverhead(
5544 cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5545 /*Insert=*/true,
5546 /*Extract=*/false, CostKind);
5547 }
5548 }
5549
5550 // Some targets keep addresses scalar.
5551 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5552 return Cost;
5553
5554 // Some targets support efficient element stores.
5555 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5556 return Cost;
5557
5558 // Collect operands to consider.
5559 CallInst *CI = dyn_cast<CallInst>(I);
5560 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5561
5562 // Skip operands that do not require extraction/scalarization and do not incur
5563 // any overhead.
5564 SmallVector<Type *> Tys;
5565 for (auto *V : filterExtractingOperands(Ops, VF))
5566 Tys.push_back(maybeVectorizeType(V->getType(), VF));
5567 return Cost + TTI.getOperandsScalarizationOverhead(
5568 filterExtractingOperands(Ops, VF), Tys, CostKind);
5569 }
5570
5571 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
5572 if (VF.isScalar())
5573 return;
5574 NumPredStores = 0;
5575 for (BasicBlock *BB : TheLoop->blocks()) {
5576 // For each instruction in the old loop.
5577 for (Instruction &I : *BB) {
5578 Value *Ptr = getLoadStorePointerOperand(&I);
5579 if (!Ptr)
5580 continue;
5581
5582 // TODO: We should generate better code and update the cost model for
5583 // predicated uniform stores. Today they are treated as any other
5584 // predicated store (see added test cases in
5585 // invariant-store-vectorization.ll).
5586 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
5587 NumPredStores++;
5588
5589 if (Legal->isUniformMemOp(I, VF)) {
5590 auto IsLegalToScalarize = [&]() {
5591 if (!VF.isScalable())
5592 // Scalarization of fixed length vectors "just works".
5593 return true;
5594
5595 // We have dedicated lowering for unpredicated uniform loads and
5596 // stores. Note that even with tail folding we know that at least
5597 // one lane is active (i.e. generalized predication is not possible
5598 // here), and the logic below depends on this fact.
5599 if (!foldTailByMasking())
5600 return true;
5601
5602 // For scalable vectors, a uniform memop load is always
5603 // uniform-by-parts and we know how to scalarize that.
5604 if (isa<LoadInst>(I))
5605 return true;
5606
5607 // A uniform store isn't necessarily uniform-by-parts,
5608 // so we can't assume scalarization.
5609 auto &SI = cast<StoreInst>(I);
5610 return TheLoop->isLoopInvariant(SI.getValueOperand());
5611 };
5612
5613 const InstructionCost GatherScatterCost =
5614 isLegalGatherOrScatter(&I, VF) ?
5615 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
5616
5617 // Load: Scalar load + broadcast
5618 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5619 // FIXME: This cost is a significant under-estimate for tail folded
5620 // memory ops.
5621 const InstructionCost ScalarizationCost =
5622 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
5623 : InstructionCost::getInvalid();
5624
5625 // Choose the better solution for the current VF. Note that invalid
5626 // costs compare as maximally large. If both are invalid, the result is
5627 // an invalid cost, which signals a failure and a vectorization abort.
5628 if (GatherScatterCost < ScalarizationCost)
5629 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
5630 else
5631 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
5632 continue;
5633 }
5634
5635 // We assume that widening is the best solution when possible.
5636 if (memoryInstructionCanBeWidened(&I, VF)) {
5637 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
5638 int ConsecutiveStride = Legal->isConsecutivePtr(
5639 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
5640 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5641 "Expected consecutive stride.");
5642 InstWidening Decision =
5643 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5644 setWideningDecision(&I, VF, Decision, Cost);
5645 continue;
5646 }
5647
5648 // Choose between Interleaving, Gather/Scatter or Scalarization.
5649 InstructionCost InterleaveCost = InstructionCost::getInvalid();
5650 unsigned NumAccesses = 1;
5651 if (isAccessInterleaved(&I)) {
5652 const auto *Group = getInterleavedAccessGroup(&I);
5653 assert(Group && "Failed to get an interleaved access group.");
5654
5655 // Make one decision for the whole group.
5656 if (getWideningDecision(&I, VF) != CM_Unknown)
5657 continue;
5658
5659 NumAccesses = Group->getNumMembers();
5660 if (interleavedAccessCanBeWidened(&I, VF))
5661 InterleaveCost = getInterleaveGroupCost(&I, VF);
5662 }
5663
5664 InstructionCost GatherScatterCost =
5665 isLegalGatherOrScatter(&I, VF)
5666 ? getGatherScatterCost(&I, VF) * NumAccesses
5667 : InstructionCost::getInvalid();
5668
5669 InstructionCost ScalarizationCost =
5670 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5671
5672 // Choose the better solution for the current VF, write down this
5673 // decision, and use it during vectorization.
5674 InstructionCost Cost;
5675 InstWidening Decision;
5676 if (InterleaveCost <= GatherScatterCost &&
5677 InterleaveCost < ScalarizationCost) {
5678 Decision = CM_Interleave;
5679 Cost = InterleaveCost;
5680 } else if (GatherScatterCost < ScalarizationCost) {
5681 Decision = CM_GatherScatter;
5682 Cost = GatherScatterCost;
5683 } else {
5684 Decision = CM_Scalarize;
5685 Cost = ScalarizationCost;
5686 }
5687 // If the instruction belongs to an interleave group, the whole group
5688 // receives the same decision. The whole group receives the cost, but
5689 // the cost will actually be assigned to a single member.
5690 if (const auto *Group = getInterleavedAccessGroup(&I))
5691 setWideningDecision(Group, VF, Decision, Cost);
5692 else
5693 setWideningDecision(&I, VF, Decision, Cost);
5694 }
5695 }
5696
5697 // Make sure that any load of address and any other address computation
5698 // remains scalar unless there is gather/scatter support. This avoids
5699 // inevitable extracts into address registers, and also has the benefit of
5700 // activating LSR more, since that pass can't optimize vectorized
5701 // addresses.
5702 if (TTI.prefersVectorizedAddressing())
5703 return;
5704
5705 // Start with all scalar pointer uses.
5706 SmallPtrSet<Instruction *, 8> AddrDefs;
5707 for (BasicBlock *BB : TheLoop->blocks())
5708 for (Instruction &I : *BB) {
5709 Instruction *PtrDef =
5710 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5711 if (PtrDef && TheLoop->contains(PtrDef) &&
5712 getWideningDecision(&I, VF) != CM_GatherScatter)
5713 AddrDefs.insert(PtrDef);
5714 }
5715
5716 // Add all instructions used to generate the addresses.
5717 SmallVector<Instruction *, 4> Worklist;
5718 append_range(Worklist, AddrDefs);
5719 while (!Worklist.empty()) {
5720 Instruction *I = Worklist.pop_back_val();
5721 for (auto &Op : I->operands())
5722 if (auto *InstOp = dyn_cast<Instruction>(Op))
5723 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5724 AddrDefs.insert(InstOp).second)
5725 Worklist.push_back(InstOp);
5726 }
5727
5728 for (auto *I : AddrDefs) {
5729 if (isa<LoadInst>(I)) {
5730 // Setting the desired widening decision should ideally be handled by
5731 // the cost functions, but since this involves finding out whether the
5732 // loaded register is involved in an address computation, it is instead
5733 // changed here when we know this is the case.
5734 InstWidening Decision = getWideningDecision(I, VF);
5735 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5736 // Scalarize a widened load of address.
5737 setWideningDecision(
5738 I, VF, CM_Scalarize,
5739 (VF.getKnownMinValue() *
5740 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
5741 else if (const auto *Group = getInterleavedAccessGroup(I)) {
5742 // Scalarize an interleave group of address loads.
5743 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5744 if (Instruction *Member = Group->getMember(I))
5745 setWideningDecision(
5746 Member, VF, CM_Scalarize,
5747 (VF.getKnownMinValue() *
5748 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
5749 }
5750 }
5751 } else {
5752 // Cannot scalarize fixed-order recurrence phis at the moment.
5753 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5754 continue;
5755
5756 // Make sure I gets scalarized and a cost estimate without
5757 // scalarization overhead.
5758 ForcedScalars[VF].insert(I);
5759 }
5760 }
5761 }
5762
5763 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
5764 assert(!VF.isScalar() &&
5765 "Trying to set a vectorization decision for a scalar VF");
5766
5767 auto ForcedScalar = ForcedScalars.find(VF);
5768 for (BasicBlock *BB : TheLoop->blocks()) {
5769 // For each instruction in the old loop.
5770 for (Instruction &I : *BB) {
5771 CallInst *CI = dyn_cast<CallInst>(&I);
5772
5773 if (!CI)
5774 continue;
5775
5776 InstructionCost ScalarCost = InstructionCost::getInvalid();
5777 InstructionCost VectorCost = InstructionCost::getInvalid();
5778 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
5779 Function *ScalarFunc = CI->getCalledFunction();
5780 Type *ScalarRetTy = CI->getType();
5781 SmallVector<Type *, 4> Tys, ScalarTys;
5782 for (auto &ArgOp : CI->args())
5783 ScalarTys.push_back(ArgOp->getType());
5784
5785 // Estimate cost of scalarized vector call. The source operands are
5786 // assumed to be vectors, so we need to extract individual elements from
5787 // there, execute VF scalar calls, and then gather the result into the
5788 // vector return value.
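// A sketch of the scalarized form (hypothetical VF = 4 call to a scalar
// function @f):
//   %a0 = extractelement <4 x float> %va, i64 0
//   %r0 = call float @f(float %a0)
//   %v0 = insertelement <4 x float> poison, float %r0, i64 0
//   ... and likewise for lanes 1-3.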
5789 InstructionCost ScalarCallCost =
5790 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
5791
5792 // Compute costs of unpacking argument values for the scalar calls and
5793 // packing the return values to a vector.
5794 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
5795
5796 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
5797 // Honor ForcedScalars and UniformAfterVectorization decisions.
5798 // TODO: For calls, it might still be more profitable to widen. Use
5799 // VPlan-based cost model to compare different options.
5800 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5801 ForcedScalar->second.contains(CI)) ||
5802 isUniformAfterVectorization(CI, VF))) {
5803 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
5804 Intrinsic::not_intrinsic, std::nullopt,
5805 ScalarCost);
5806 continue;
5807 }
5808
5809 bool MaskRequired = Legal->isMaskRequired(CI);
5810 // Compute corresponding vector type for return value and arguments.
5811 Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
5812 for (Type *ScalarTy : ScalarTys)
5813 Tys.push_back(toVectorizedTy(ScalarTy, VF));
5814
5815 // An in-loop reduction using an fmuladd intrinsic is a special case;
5816 // we don't want the normal cost for that intrinsic.
5817 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
5818 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
5819 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
5820 getVectorIntrinsicIDForCall(CI, TLI),
5821 std::nullopt, *RedCost);
5822 continue;
5823 }
5824
5825 // Find the cost of vectorizing the call, if we can find a suitable
5826 // vector variant of the function.
5827 VFInfo FuncInfo;
5828 Function *VecFunc = nullptr;
5829 // Search through any available variants for one we can use at this VF.
5830 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
5831 // Must match requested VF.
5832 if (Info.Shape.VF != VF)
5833 continue;
5834
5835 // Must take a mask argument if one is required
5836 if (MaskRequired && !Info.isMasked())
5837 continue;
5838
5839 // Check that all parameter kinds are supported
5840 bool ParamsOk = true;
5841 for (VFParameter Param : Info.Shape.Parameters) {
5842 switch (Param.ParamKind) {
5843 case VFParamKind::Vector:
5844 break;
5845 case VFParamKind::OMP_Uniform: {
5846 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5847 // Make sure the scalar parameter in the loop is invariant.
5848 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
5849 TheLoop))
5850 ParamsOk = false;
5851 break;
5852 }
5853 case VFParamKind::OMP_Linear: {
5854 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5855 // Find the stride for the scalar parameter in this loop and see if
5856 // it matches the stride for the variant.
5857 // TODO: do we need to figure out the cost of an extract to get the
5858 // first lane? Or do we hope that it will be folded away?
5859 ScalarEvolution *SE = PSE.getSE();
5860 const auto *SAR =
5861 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
5862
5863 if (!SAR || SAR->getLoop() != TheLoop) {
5864 ParamsOk = false;
5865 break;
5866 }
5867
5868 const SCEVConstant *Step =
5869 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
5870
5871 if (!Step ||
5872 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
5873 ParamsOk = false;
5874
5875 break;
5876 }
5877 case VFParamKind::GlobalPredicate:
5878 break;
5879 default:
5880 ParamsOk = false;
5881 break;
5882 }
5883 }
5884
5885 if (!ParamsOk)
5886 continue;
5887
5888 // Found a suitable candidate, stop here.
5889 VecFunc = CI->getModule()->getFunction(Info.VectorName);
5890 FuncInfo = Info;
5891 break;
5892 }
5893
5894 if (TLI && VecFunc && !CI->isNoBuiltin())
5895 VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
5896
5897 // Find the cost of an intrinsic; some targets may have instructions that
5898 // perform the operation without needing an actual call.
5899 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
5900 if (IID != Intrinsic::not_intrinsic)
5901 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
5902
5903 InstructionCost Cost = ScalarCost;
5904 InstWidening Decision = CM_Scalarize;
5905
5906 if (VectorCost <= Cost) {
5907 Cost = VectorCost;
5908 Decision = CM_VectorCall;
5909 }
5910
5911 if (IntrinsicCost <= Cost) {
5912 Cost = IntrinsicCost;
5913 Decision = CM_IntrinsicCall;
5914 }
5915
5916 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
5917 FuncInfo.getParamIndexForOptionalMask(), Cost);
5918 }
5919 }
5920 }
5921
5922 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
5923 if (!Legal->isInvariant(Op))
5924 return false;
5925 // Consider Op invariant only if neither it nor its operands are predicated
5926 // instructions in the loop; a predicated value is not trivially hoistable.
5927 auto *OpI = dyn_cast<Instruction>(Op);
5928 return !OpI || !TheLoop->contains(OpI) ||
5929 (!isPredicatedInst(OpI) &&
5930 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
5931 all_of(OpI->operands(),
5932 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
5933 }
5934
5935 InstructionCost
5936 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5937 ElementCount VF) {
5938 // If we know that this instruction will remain uniform, check the cost of
5939 // the scalar version.
5940 if (isUniformAfterVectorization(I, VF))
5941 VF = ElementCount::getFixed(1);
5942
5943 if (VF.isVector() && isProfitableToScalarize(I, VF))
5944 return InstsToScalarize[VF][I];
5945
5946 // Forced scalars do not have any scalarization overhead.
5947 auto ForcedScalar = ForcedScalars.find(VF);
5948 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
5949 auto InstSet = ForcedScalar->second;
5950 if (InstSet.count(I))
5951 return getInstructionCost(I, ElementCount::getFixed(1)) *
5952 VF.getKnownMinValue();
5953 }
5954
5955 Type *RetTy = I->getType();
5956 if (canTruncateToMinimalBitwidth(I, VF))
5957 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5958 auto *SE = PSE.getSE();
5959
5960 Type *VectorTy;
5961 if (isScalarAfterVectorization(I, VF)) {
5962 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
5963 [this](Instruction *I, ElementCount VF) -> bool {
5964 if (VF.isScalar())
5965 return true;
5966
5967 auto Scalarized = InstsToScalarize.find(VF);
5968 assert(Scalarized != InstsToScalarize.end() &&
5969 "VF not yet analyzed for scalarization profitability");
5970 return !Scalarized->second.count(I) &&
5971 llvm::all_of(I->users(), [&](User *U) {
5972 auto *UI = cast<Instruction>(U);
5973 return !Scalarized->second.count(UI);
5974 });
5975 };
5976
5977 // With the exception of GEPs and PHIs, after scalarization there should
5978 // only be one copy of the instruction generated in the loop. This is
5979 // because the VF is either 1, or any instructions that need scalarizing
5980 // have already been dealt with by the time we get here. As a result,
5981 // it means we don't have to multiply the instruction cost by VF.
5982 assert(I->getOpcode() == Instruction::GetElementPtr ||
5983 I->getOpcode() == Instruction::PHI ||
5984 (I->getOpcode() == Instruction::BitCast &&
5985 I->getType()->isPointerTy()) ||
5986 HasSingleCopyAfterVectorization(I, VF));
5987 VectorTy = RetTy;
5988 } else
5989 VectorTy = toVectorizedTy(RetTy, VF);
5990
5991 if (VF.isVector() && VectorTy->isVectorTy() &&
5992 !TTI.getNumberOfParts(VectorTy))
5993 return InstructionCost::getInvalid();
5994
5995 // TODO: We need to estimate the cost of intrinsic calls.
5996 switch (I->getOpcode()) {
5997 case Instruction::GetElementPtr:
5998 // We mark this instruction as zero-cost because the cost of GEPs in
5999 // vectorized code depends on whether the corresponding memory instruction
6000 // is scalarized or not. Therefore, we handle GEPs with the memory
6001 // instruction cost.
6002 return 0;
6003 case Instruction::Br: {
6004 // In cases of scalarized and predicated instructions, there will be VF
6005 // predicated blocks in the vectorized loop. Each branch around these
6006 // blocks requires also an extract of its vector compare i1 element.
6007 // Note that the conditional branch from the loop latch will be replaced by
6008 // a single branch controlling the loop, so there is no extra overhead from
6009 // scalarization.
6010 bool ScalarPredicatedBB = false;
6011 BranchInst *BI = cast<BranchInst>(I);
6012 if (VF.isVector() && BI->isConditional() &&
6013 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6014 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6015 BI->getParent() != TheLoop->getLoopLatch())
6016 ScalarPredicatedBB = true;
6017
6018 if (ScalarPredicatedBB) {
6019 // Not possible to scalarize a scalable vector with predicated instructions.
6020 if (VF.isScalable())
6021 return InstructionCost::getInvalid();
6022 // Return cost for branches around scalarized and predicated blocks.
6023 auto *VecI1Ty =
6024 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6025 return (
6026 TTI.getScalarizationOverhead(
6027 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6028 /*Insert*/ false, /*Extract*/ true, CostKind) +
6029 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6030 }
6031
6032 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6033 // The back-edge branch will remain, as will all scalar branches.
6034 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6035
6036 // This branch will be eliminated by if-conversion.
6037 return 0;
6038 // Note: We currently assume zero cost for an unconditional branch inside
6039 // a predicated block since it will become a fall-through, although we
6040 // may decide in the future to call TTI for all branches.
6041 }
6042 case Instruction::Switch: {
6043 if (VF.isScalar())
6044 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6045 auto *Switch = cast<SwitchInst>(I);
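// A vectorized switch is costed as one vector compare per case; e.g. a
// hypothetical three-case switch at VF = 4 is modelled as three
// <4 x iN> icmp-eq operations against the splatted case values.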
6046 return Switch->getNumCases() *
6047 TTI.getCmpSelInstrCost(
6048 Instruction::ICmp,
6049 toVectorTy(Switch->getCondition()->getType(), VF),
6050 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6051 CmpInst::ICMP_EQ, CostKind);
6052 }
6053 case Instruction::PHI: {
6054 auto *Phi = cast<PHINode>(I);
6055
6056 // First-order recurrences are replaced by vector shuffles inside the loop.
6057 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6058 SmallVector<int> Mask(VF.getKnownMinValue());
6059 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
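// E.g. at a fixed VF = 4 the mask is <3, 4, 5, 6>: lane 3 of the previous
// iteration's vector followed by lanes 0-2 of the current one, matching the
// semantics of a vector splice.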
6060 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6061 cast<VectorType>(VectorTy),
6062 cast<VectorType>(VectorTy), Mask, CostKind,
6063 VF.getKnownMinValue() - 1);
6064 }
6065
6066 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6067 // converted into select instructions. We require N - 1 selects per phi
6068 // node, where N is the number of incoming values.
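// E.g. a hypothetical phi with three incoming values is modelled as two
// chained selects, roughly:
//   %s0 = select <4 x i1> %c0, <4 x i32> %v0, <4 x i32> %v1
//   %s1 = select <4 x i1> %c1, <4 x i32> %s0, <4 x i32> %v2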
6069 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6070 Type *ResultTy = Phi->getType();
6071
6072 // All instructions in an Any-of reduction chain are narrowed to bool.
6073 // Check if that is the case for this phi node.
6074 auto *HeaderUser = cast_if_present<PHINode>(
6075 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6076 auto *Phi = dyn_cast<PHINode>(U);
6077 if (Phi && Phi->getParent() == TheLoop->getHeader())
6078 return Phi;
6079 return nullptr;
6080 }));
6081 if (HeaderUser) {
6082 auto &ReductionVars = Legal->getReductionVars();
6083 auto Iter = ReductionVars.find(HeaderUser);
6084 if (Iter != ReductionVars.end() &&
6085 RecurrenceDescriptor::isAnyOfRecurrenceKind(
6086 Iter->second.getRecurrenceKind()))
6087 ResultTy = Type::getInt1Ty(Phi->getContext());
6088 }
6089 return (Phi->getNumIncomingValues() - 1) *
6090 TTI.getCmpSelInstrCost(
6091 Instruction::Select, toVectorTy(ResultTy, VF),
6092 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6093 CmpInst::BAD_ICMP_PREDICATE, CostKind);
6094 }
6095
6096 // When tail folding with EVL, if the phi is part of an out of loop
6097 // reduction then it will be transformed into a wide vp_merge.
6098 if (VF.isVector() && foldTailWithEVL() &&
6099 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6100 IntrinsicCostAttributes ICA(
6101 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6102 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6103 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6104 }
6105
6106 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6107 }
6108 case Instruction::UDiv:
6109 case Instruction::SDiv:
6110 case Instruction::URem:
6111 case Instruction::SRem:
6112 if (VF.isVector() && isPredicatedInst(I)) {
6113 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6114 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6115 ScalarCost : SafeDivisorCost;
6116 }
6117 // We've proven all lanes safe to speculate, fall through.
6118 [[fallthrough]];
6119 case Instruction::Add:
6120 case Instruction::Sub: {
6121 auto Info = Legal->getHistogramInfo(I);
6122 if (Info && VF.isVector()) {
6123 const HistogramInfo *HGram = Info.value();
6124 // Assume that a non-constant update value (or a constant != 1) requires
6125 // a multiply, and add that into the cost.
6126 InstructionCost MulCost = TTI::TCC_Free;
6127 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6128 if (!RHS || RHS->getZExtValue() != 1)
6129 MulCost =
6130 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6131
6132 // Find the cost of the histogram operation itself.
6133 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6134 Type *ScalarTy = I->getType();
6135 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6136 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6137 Type::getVoidTy(I->getContext()),
6138 {PtrTy, ScalarTy, MaskTy});
6139
6140 // Add the costs together with the add/sub operation.
6141 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6142 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6143 }
6144 [[fallthrough]];
6145 }
6146 case Instruction::FAdd:
6147 case Instruction::FSub:
6148 case Instruction::Mul:
6149 case Instruction::FMul:
6150 case Instruction::FDiv:
6151 case Instruction::FRem:
6152 case Instruction::Shl:
6153 case Instruction::LShr:
6154 case Instruction::AShr:
6155 case Instruction::And:
6156 case Instruction::Or:
6157 case Instruction::Xor: {
6158 // If we're speculating on the stride being 1, the multiplication may
6159 // fold away. We can generalize this for all operations using the notion
6160 // of neutral elements. (TODO)
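// E.g. if %stride is speculated (via an SCEV predicate) to be 1, a
// hypothetical 'mul i64 %i, %stride' folds away entirely, so it is given a
// cost of 0 here.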
6161 if (I->getOpcode() == Instruction::Mul &&
6162 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
6163 PSE.getSCEV(I->getOperand(0))->isOne()) ||
6164 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
6165 PSE.getSCEV(I->getOperand(1))->isOne())))
6166 return 0;
6167
6168 // Detect reduction patterns
6169 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6170 return *RedCost;
6171
6172 // Certain instructions can be cheaper to vectorize if they have a constant
6173 // second vector operand. One example of this is shifts on x86.
6174 Value *Op2 = I->getOperand(1);
6175 if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
6176 PSE.getSE()->isSCEVable(Op2->getType()) &&
6177 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6178 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6179 }
6180 auto Op2Info = TTI.getOperandInfo(Op2);
6181 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6182 shouldConsiderInvariant(Op2))
6183 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6184
6185 SmallVector<const Value *, 4> Operands(I->operand_values());
6186 return TTI.getArithmeticInstrCost(
6187 I->getOpcode(), VectorTy, CostKind,
6188 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6189 Op2Info, Operands, I, TLI);
6190 }
6191 case Instruction::FNeg: {
6192 return TTI.getArithmeticInstrCost(
6193 I->getOpcode(), VectorTy, CostKind,
6194 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6195 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6196 I->getOperand(0), I);
6197 }
6198 case Instruction::Select: {
6199 SelectInst *SI = cast<SelectInst>(I);
6200 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6201 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6202
6203 const Value *Op0, *Op1;
6204 using namespace llvm::PatternMatch;
6205 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6206 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6207 // select x, y, false --> x & y
6208 // select x, true, y --> x | y
6209 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6210 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6211 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6212 Op1->getType()->getScalarSizeInBits() == 1);
6213
6214 SmallVector<const Value *, 2> Operands{Op0, Op1};
6215 return TTI.getArithmeticInstrCost(
6216 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6217 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6218 }
6219
6220 Type *CondTy = SI->getCondition()->getType();
6221 if (!ScalarCond)
6222 CondTy = VectorType::get(CondTy, VF);
6223
6224 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6225 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6226 Pred = Cmp->getPredicate();
6227 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6228 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6229 {TTI::OK_AnyValue, TTI::OP_None}, I);
6230 }
6231 case Instruction::ICmp:
6232 case Instruction::FCmp: {
6233 Type *ValTy = I->getOperand(0)->getType();
6234
6235 if (canTruncateToMinimalBitwidth(I, VF)) {
6236 [[maybe_unused]] Instruction *Op0AsInstruction =
6237 dyn_cast<Instruction>(I->getOperand(0));
6238 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6239 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6240 "if both the operand and the compare are marked for "
6241 "truncation, they must have the same bitwidth");
6242 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6243 }
6244
6245 VectorTy = toVectorTy(ValTy, VF);
6246 return TTI.getCmpSelInstrCost(
6247 I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
6248 cast<CmpInst>(I)->getPredicate(), CostKind,
6249 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
6250 }
6251 case Instruction::Store:
6252 case Instruction::Load: {
6253 ElementCount Width = VF;
6254 if (Width.isVector()) {
6255 InstWidening Decision = getWideningDecision(I, Width);
6256 assert(Decision != CM_Unknown &&
6257 "CM decision should be taken at this point");
6258 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6259 return InstructionCost::getInvalid();
6260 if (Decision == CM_Scalarize)
6261 Width = ElementCount::getFixed(1);
6262 }
6263 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6264 return getMemoryInstructionCost(I, VF);
6265 }
6266 case Instruction::BitCast:
6267 if (I->getType()->isPointerTy())
6268 return 0;
6269 [[fallthrough]];
6270 case Instruction::ZExt:
6271 case Instruction::SExt:
6272 case Instruction::FPToUI:
6273 case Instruction::FPToSI:
6274 case Instruction::FPExt:
6275 case Instruction::PtrToInt:
6276 case Instruction::IntToPtr:
6277 case Instruction::SIToFP:
6278 case Instruction::UIToFP:
6279 case Instruction::Trunc:
6280 case Instruction::FPTrunc: {
6281 // Computes the CastContextHint from a Load/Store instruction.
6282 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6283 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6284 "Expected a load or a store!");
6285
6286 if (VF.isScalar() || !TheLoop->contains(I))
6287 return TTI::CastContextHint::Normal;
6288
6289 switch (getWideningDecision(I, VF)) {
6290 case LoopVectorizationCostModel::CM_GatherScatter:
6291 return TTI::CastContextHint::GatherScatter;
6292 case LoopVectorizationCostModel::CM_Interleave:
6293 return TTI::CastContextHint::Interleave;
6294 case LoopVectorizationCostModel::CM_Scalarize:
6295 case LoopVectorizationCostModel::CM_Widen:
6296 return isPredicatedInst(I) ? TTI::CastContextHint::Masked
6297 : TTI::CastContextHint::Normal;
6298 case LoopVectorizationCostModel::CM_Widen_Reverse:
6299 return TTI::CastContextHint::Reversed;
6300 case LoopVectorizationCostModel::CM_Unknown:
6301 llvm_unreachable("Instr did not go through cost modelling?");
6302 case LoopVectorizationCostModel::CM_VectorCall:
6303 case LoopVectorizationCostModel::CM_IntrinsicCall:
6304 llvm_unreachable_internal("Instr has invalid widening decision");
6305 }
6306
6307 llvm_unreachable("Unhandled case!");
6308 };
6309
6310 unsigned Opcode = I->getOpcode();
6311 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6312 // For Trunc, the context is the only user, which must be a StoreInst.
6313 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6314 if (I->hasOneUse())
6315 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6316 CCH = ComputeCCH(Store);
6317 }
6318 // For Z/Sext, the context is the operand, which must be a LoadInst.
6319 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6320 Opcode == Instruction::FPExt) {
6321 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6322 CCH = ComputeCCH(Load);
6323 }
6324
6325 // We optimize the truncation of induction variables having constant
6326 // integer steps. The cost of these truncations is the same as the scalar
6327 // operation.
6328 if (isOptimizableIVTruncate(I, VF)) {
6329 auto *Trunc = cast<TruncInst>(I);
6330 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6331 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6332 }
6333
6334 // Detect reduction patterns
6335 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6336 return *RedCost;
6337
6338 Type *SrcScalarTy = I->getOperand(0)->getType();
6339 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6340 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6341 SrcScalarTy =
6342 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6343 Type *SrcVecTy =
6344 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6345
6346 if (canTruncateToMinimalBitwidth(I, VF)) {
6347 // If the result type is <= the source type, there will be no extend
6348 // after truncating the users to the minimal required bitwidth.
6349 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6350 (I->getOpcode() == Instruction::ZExt ||
6351 I->getOpcode() == Instruction::SExt))
6352 return 0;
6353 }
6354
6355 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6356 }
6357 case Instruction::Call:
6358 return getVectorCallCost(cast<CallInst>(I), VF);
6359 case Instruction::ExtractValue:
6360 return TTI.getInstructionCost(I, CostKind);
6361 case Instruction::Alloca:
6362 // We cannot easily widen alloca to a scalable alloca, as
6363 // the result would need to be a vector of pointers.
6364 if (VF.isScalable())
6365 return InstructionCost::getInvalid();
6366 [[fallthrough]];
6367 default:
6368 // This opcode is unknown. Assume that it is the same as 'mul'.
6369 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6370 } // end of switch.
6371 }
6372
6373 void LoopVectorizationCostModel::collectValuesToIgnore() {
6374 // Ignore ephemeral values.
6375 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6376
6377 SmallVector<Value *, 4> DeadInterleavePointerOps;
6378 SmallVector<Value *, 4> DeadOps;
6379
6380 // If a scalar epilogue is required, users outside the loop won't use
6381 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6382 // that is the case.
6383 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6384 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6385 return RequiresScalarEpilogue &&
6386 !TheLoop->contains(cast<Instruction>(U)->getParent());
6387 };
6388
6389 LoopBlocksDFS DFS(TheLoop);
6390 DFS.perform(LI);
6391 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6392 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6393 for (Instruction &I : reverse(*BB)) {
6394 // Find all stores to invariant variables. Since they are going to be sunk
6395 // outside the loop, we do not need to calculate a cost for them.
6396 StoreInst *SI;
6397 if ((SI = dyn_cast<StoreInst>(&I)) &&
6398 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6399 ValuesToIgnore.insert(&I);
6400 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6401 SI->getValueOperand());
6402 }
6403
6404 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6405 continue;
6406
6407 // Add instructions that would be trivially dead and are only used by
6408 // values already ignored to DeadOps to seed worklist.
6409 if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6410 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6411 return VecValuesToIgnore.contains(U) ||
6412 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6413 }))
6414 DeadOps.push_back(&I);
6415
6416 // For interleave groups, we only create a pointer for the start of the
6417 // interleave group. Queue up addresses of group members except the insert
6418 // position for further processing.
6419 if (isAccessInterleaved(&I)) {
6420 auto *Group = getInterleavedAccessGroup(&I);
6421 if (Group->getInsertPos() == &I)
6422 continue;
6423 Value *PointerOp = getLoadStorePointerOperand(&I);
6424 DeadInterleavePointerOps.push_back(PointerOp);
6425 }
6426
6427 // Queue branches for analysis. They are dead if their successors only
6428 // contain dead instructions.
6429 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6430 if (Br->isConditional())
6431 DeadOps.push_back(&I);
6432 }
6433 }
6434
6435 // Mark ops feeding interleave group members as free, if they are only used
6436 // by other dead computations.
6437 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6438 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6439 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6440 Instruction *UI = cast<Instruction>(U);
6441 return !VecValuesToIgnore.contains(U) &&
6442 (!isAccessInterleaved(UI) ||
6443 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6444 }))
6445 continue;
6446 VecValuesToIgnore.insert(Op);
6447 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6448 }
6449
6450 for (const auto &[_, Ops] : DeadInvariantStoreOps)
6451 llvm::append_range(DeadOps, drop_end(Ops));
6452
6453 // Mark ops that would be trivially dead and are only used by ignored
6454 // instructions as free.
6455 BasicBlock *Header = TheLoop->getHeader();
6456
6457 // Returns true if the block contains only dead instructions. Such blocks will
6458 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6459 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6460 auto IsEmptyBlock = [this](BasicBlock *BB) {
6461 return all_of(*BB, [this](Instruction &I) {
6462 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6463 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6464 });
6465 };
6466 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6467 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6468
6469 // Check if the branch should be considered dead.
6470 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6471 BasicBlock *ThenBB = Br->getSuccessor(0);
6472 BasicBlock *ElseBB = Br->getSuccessor(1);
6473 // Don't consider branches leaving the loop for simplification.
6474 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6475 continue;
6476 bool ThenEmpty = IsEmptyBlock(ThenBB);
6477 bool ElseEmpty = IsEmptyBlock(ElseBB);
6478 if ((ThenEmpty && ElseEmpty) ||
6479 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6480 ElseBB->phis().empty()) ||
6481 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6482 ThenBB->phis().empty())) {
6483 VecValuesToIgnore.insert(Br);
6484 DeadOps.push_back(Br->getCondition());
6485 }
6486 continue;
6487 }
6488
6489 // Skip any op that shouldn't be considered dead.
6490 if (!Op || !TheLoop->contains(Op) ||
6491 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6492 !wouldInstructionBeTriviallyDead(Op, TLI) ||
6493 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6494 return !VecValuesToIgnore.contains(U) &&
6495 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6496 }))
6497 continue;
6498
6499 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6500 // which applies for both scalar and vector versions. Otherwise it is only
6501 // dead in vector versions, so only add it to VecValuesToIgnore.
6502 if (all_of(Op->users(),
6503 [this](User *U) { return ValuesToIgnore.contains(U); }))
6504 ValuesToIgnore.insert(Op);
6505
6506 VecValuesToIgnore.insert(Op);
6507 DeadOps.append(Op->op_begin(), Op->op_end());
6508 }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert_range(Casts);
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert_range(Casts);
  }
}

void LoopVectorizationCostModel::collectInLoopReductions() {
  // Avoid duplicating work finding in-loop reductions.
  if (!InLoopReductions.empty())
    return;

  for (const auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such.
    RecurKind Kind = RdxDesc.getRecurrenceKind();
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Kind, Phi->getType()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();

    if (InLoop) {
      InLoopReductions.insert(Phi);
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
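      // E.g., for a chain 'phi -> add1 -> add2' feeding the exit value, the
      // map records {add1 -> phi, add2 -> add1}, letting the cost model walk
      // from any reduction op back to its immediate predecessor.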
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// This function will select a scalable VF if the target supports scalable
// vectors and a fixed one otherwise.
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
                                     LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();

  TargetTransformInfo::RegisterKind RegKind =
      TTI.enableScalableVectorization()
          ? TargetTransformInfo::RGK_ScalableVector
          : TargetTransformInfo::RGK_FixedWidthVector;

  TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
  unsigned N = RegSize.getKnownMinValue() / WidestType;
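  // E.g., with a known minimum register size of 128 bits and a widest loop
  // type of 32 bits, N is 4, yielding VF 'vscale x 4' on scalable targets
  // and VF 4 otherwise.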
  return ElementCount::get(N, RegSize.isScalable());
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: they may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          "Scalable vectorization requested but not supported by the target",
          "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          "ScalableVFUnfeasible", ORE, OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    if (VPlans.empty())
      return VectorizationFactor::Disabled();

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/, 0 /*ScalarCost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all
      // decisions based on them, which includes widening decisions and
      // uniform and scalar values.
      CM.invalidateCostModelingDecisions();
  }

  if (CM.foldTailByMasking())
    Legal->prepareToFoldTailByMasking();

  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
      reportVectorizationInfo(
          "UserVF ignored because it may be larger than the maximal safe VF",
          "InvalidUserVF", ORE, OrigLoop);
    } else {
      assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
             "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be
      // more profitable to scalarize.
      CM.collectInLoopReductions();
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(UserVF, UserVF);
        LLVM_DEBUG(printPlans(dbgs()));
        return;
      }
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
    }
  }

  // Collect the vectorization factor candidates.
  SmallVector<ElementCount> VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(VF);
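  // E.g., with MaxFactors.FixedVF = 16 and MaxFactors.ScalableVF =
  // 'vscale x 4', the candidates are 1, 2, 4, 8, 16, vscale x 1, vscale x 2
  // and vscale x 4.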

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectNonVectorizedAndSetWideningDecisions(VF);
  }

  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
}

InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
                                             ElementCount VF) const {
  if (ForceTargetInstructionCost.getNumOccurrences())
    return InstructionCost(ForceTargetInstructionCost);
  return CM.getInstructionCost(UI, VF);
}

bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
                                                      ElementCount VF) const {
  return CM.isUniformAfterVectorization(I, VF);
}

bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
  return CM.ValuesToIgnore.contains(UI) ||
         (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
         SkipCostComputation.contains(UI);
}

InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {
  InstructionCost Cost;
  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring-up, directly use the induction costs from the
  // legacy cost model. Note that we do this as pre-processing; the VPlan may
  // not have any recipes associated with the original induction increment
  // instruction and may replace truncates with VPWidenIntOrFpInductionRecipe.
  // We precompute the cost of induction phis and increments (both those that
  // are represented by recipes and those that are not), to avoid
  // distinguishing between them here, and skip all recipes that represent
  // induction phis and increments (the former case) later on, if they exist,
  // to avoid counting them twice. Similarly, we pre-compute the cost of any
  // optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
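  // E.g., for an induction 'iv = phi [0, ph], [iv.next, latch]' with
  // 'iv.next = add iv, 1' and an optimizable 'trunc iv to i8', the phi, the
  // increment, and the truncate are all costed here and skipped later.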
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
    SmallVector<Instruction *> IVInsts = {IVInc};
    for (unsigned I = 0; I != IVInsts.size(); I++) {
      for (Value *Op : IVInsts[I]->operands()) {
        auto *OpI = dyn_cast<Instruction>(Op);
        if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
          continue;
        IVInsts.push_back(OpI);
      }
    }
    IVInsts.push_back(IV);
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(U);
      if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
        continue;
      IVInsts.push_back(CI);
    }

    // If the vector loop gets executed exactly once with the given VF, ignore
    // the costs of comparison and induction instructions, as they'll get
    // simplified away.
    // TODO: Remove this code after stepping away from the legacy cost model
    // and adding code to simplify VPlans before calculating their costs.
    auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
    if (TC == VF && !CM.foldTailByMasking())
      addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
                                           CostCtx.SkipCostComputation);

    for (Instruction *IVInst : IVInsts) {
      if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(IVInst);
    }
  }

  // Compute the cost of all exiting conditions of the loop using the legacy
  // cost model. This is to match the legacy behavior, which adds the cost of
  // all exit conditions. Note that this over-estimates the cost, as there
  // will be a single condition to control the vector loop.
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
    if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
      ExitInstrs.insert(CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(CondI) ||
        !CostCtx.SkipCostComputation.insert(CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Op);
      if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
          any_of(OpI->users(), [&ExitInstrs, this](User *U) {
            return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
                   !ExitInstrs.contains(cast<Instruction>(U));
          }))
        continue;
      ExitInstrs.insert(OpI);
    }
  }

  // Pre-compute the costs for branches except for the backedge, as the number
  // of replicate regions in a VPlan may not directly match the number of
  // branches, which would lead to different decisions.
  // TODO: Compute cost of branches for each replicate region in the VPlan,
  // which is more accurate than the legacy cost model.
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(BB->getTerminator());
    if (BB == OrigLoop->getLoopLatch())
      continue;
    auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
    Cost += BranchCost;
  }

  // Pre-compute costs for instructions that are forced-scalar or profitable to
  // scalarize. Their costs will be computed separately in the legacy cost
  // model.
  for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
    if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(ForcedScalar);
    InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ForcedCost << " for VF " << VF
             << ": forced scalar " << *ForcedScalar << "\n";
    });
    Cost += ForcedCost;
  }
  for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
    if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Scalarized);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ScalarCost << " for VF " << VF
             << ": profitable to scalarize " << *Scalarized << "\n";
    });
    Cost += ScalarCost;
  }

  return Cost;
}

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                        CM.CostKind);
  InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
  unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid. */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}

#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplifications that
/// the legacy cost model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop,
                                                  ElementCount VF) {
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
      // cost model won't cost them, whilst the legacy model will.
      if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
        if (none_of(FOR->users(), [](VPUser *U) {
              auto *VPI = dyn_cast<VPInstruction>(U);
              return VPI && VPI->getOpcode() ==
                                VPInstruction::FirstOrderRecurrenceSplice;
            }))
          return true;
      }
      // The VPlan-based cost model is more accurate for partial reductions and
      // comparing against the legacy cost isn't desirable.
      if (isa<VPPartialReductionRecipe>(&R))
        return true;

      // If a VPlan transform folded a recipe to one producing a single scalar,
      // but the original instruction wasn't uniform-after-vectorization in the
      // legacy cost model, the legacy cost overestimates the actual cost.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isSingleScalar() &&
            !CostCtx.isLegacyUniformAfterVectorization(
                RepR->getUnderlyingInstr(), VF))
          return true;
      }
      if (Instruction *UI = GetInstructionForCost(&R)) {
        // If we adjusted the predicate of the recipe, the cost in the legacy
        // cost model may be different.
        if (auto *WidenCmp = dyn_cast<VPWidenRecipe>(&R)) {
          if ((WidenCmp->getOpcode() == Instruction::ICmp ||
               WidenCmp->getOpcode() == Instruction::FCmp) &&
              WidenCmp->getPredicate() != cast<CmpInst>(UI)->getPredicate())
            return true;
        }
        SeenInstrs.insert(UI);
      }
    }
  }

  // Return true if the loop contains any instructions that are not also part
  // of the VPlan or are skipped for VPlan-based cost computations. This
  // indicates that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      // Skip induction phis when checking for simplifications, as they may
      // not be lowered directly to a corresponding PHI recipe.
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
#endif

VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
  if (VPlans.empty())
    return VectorizationFactor::Disabled();
  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
    return {*FirstPlan.vectorFactors().begin(), 0, 0};

  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
                    << (CM.CostKind == TTI::TCK_RecipThroughput
                            ? "Reciprocal Throughput\n"
                        : CM.CostKind == TTI::TCK_Latency
                            ? "Instruction Latency\n"
                        : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
                        : CM.CostKind == TTI::TCK_SizeAndLatency
                            ? "Code Size and Latency\n"
                            : "Unknown\n"));

  ElementCount ScalarVF = ElementCount::getFixed(1);
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // TODO: Compute scalar cost using VPlan-based cost model.
  InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
  VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
  VectorizationFactor BestFactor = ScalarFactor;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    BestFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    SmallVector<VPRegisterUsage, 8> RUs;
    if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
        CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      if (VF.isScalar())
        continue;
      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }
      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      InstructionCost Cost = cost(*P, VF);
      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

      if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) {
        LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
                          << VF << " because it uses too many registers\n");
        continue;
      }

      if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
        BestFactor = CurrentFactor;

      // If profitable, add it to the list of profitable VFs.
      if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
        ProfitableVFs.push_back(CurrentFactor);
    }
  }

#ifndef NDEBUG
  // Select the optimal vectorization factor according to the legacy cost
  // model. This is now only used to verify the decisions by the new
  // VPlan-based cost model and will be retired once the VPlan-based cost
  // model is stabilized.
  VectorizationFactor LegacyVF = selectVectorizationFactor();
  VPlan &BestPlan = getPlanFor(BestFactor.Width);

  // Pre-compute the cost and use it to check if BestPlan contains any
  // simplifications not accounted for in the legacy cost model. If that's the
  // case, don't trigger the assertion, as the extra simplifications may cause
  // a different VF to be picked by the VPlan-based cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                        CM.CostKind);
  precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
  // Verify that the VPlan-based and legacy cost models agree, except for
  // VPlans with early exits and plans with additional VPlan simplifications.
  // The legacy cost model doesn't properly model costs for such loops.
  assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
          planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
                                                CostCtx, OrigLoop,
                                                BestFactor.Width) ||
          planContainsAdditionalSimplifications(
              getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
         "VPlan cost model and legacy cost model disagreed");
  assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be computed.");
#endif

  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
  return BestFactor;
}

static void addRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata |=
            S && S->getString().starts_with("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(I));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
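    // E.g., the final LoopID then resembles (schematically):
    //   !0 = distinct !{!0, <existing operands...>,
    //                   !{!"llvm.loop.unroll.runtime.disable"}}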
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}

static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
  using namespace VPlanPatternMatch;
  assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
         "RdxResult must be ComputeFindIVResult");
  VPValue *StartVPV = RdxResult->getOperand(1);
  match(StartVPV, m_Freeze(m_VPValue(StartVPV)));
  return StartVPV->getLiveInIRValue();
}

// If \p EpiResumePhiR is a resume VPPhi for a reduction when vectorizing the
// epilogue loop, fix the reduction's scalar PHI node by adding the incoming
// value from the main vector loop.
static void fixReductionScalarResumeWhenVectorizingEpilog(
    VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) {
  // Get the VPInstruction computing the reduction result in the middle block.
  // The first operand may not be from the middle block if it is not connected
  // to the scalar preheader. In that case, there's nothing to fix.
  VPValue *Incoming = EpiResumePhiR->getOperand(0);
  match(Incoming, VPlanPatternMatch::m_ZExtOrSExt(
                      VPlanPatternMatch::m_VPValue(Incoming)));
  auto *EpiRedResult = dyn_cast<VPInstruction>(Incoming);
  if (!EpiRedResult ||
      (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
       EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
    return;

  auto *EpiRedHeaderPhi =
      cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
  RecurKind Kind = EpiRedHeaderPhi->getRecurrenceKind();
  Value *MainResumeValue;
  if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
    assert((VPI->getOpcode() == VPInstruction::Broadcast ||
            VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
           "unexpected start recipe");
    MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
  } else
    MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
  if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) {
    [[maybe_unused]] Value *StartV =
        EpiRedResult->getOperand(1)->getLiveInIRValue();
    auto *Cmp = cast<ICmpInst>(MainResumeValue);
    assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
           "AnyOf expected to start with ICMP_NE");
    assert(Cmp->getOperand(1) == StartV &&
           "AnyOf expected to start by comparing main resume value to original "
           "start value");
    MainResumeValue = Cmp->getOperand(0);
  } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
    Value *StartV = getStartValueFromReductionResult(EpiRedResult);
    Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
    using namespace llvm::PatternMatch;
    Value *Cmp, *OrigResumeV, *CmpOp;
    [[maybe_unused]] bool IsExpectedPattern =
        match(MainResumeValue,
              m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
                       m_Value(OrigResumeV))) &&
        (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
                                   m_Value(CmpOp))) &&
         ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
    assert(IsExpectedPattern && "Unexpected reduction resume pattern");
    MainResumeValue = OrigResumeV;
  }
  PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);

  // When fixing reductions in the epilogue loop we should already have
  // created a bc.merge.rdx Phi after the main vector body. Ensure that we
  // carry over the incoming values correctly.
  auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiR, true));
  EpiResumePhi->setIncomingValueForBlock(
      BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
}

DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
    ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
    InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
  assert(BestVPlan.hasVF(BestVF) &&
         "Trying to execute plan with unsupported VF");
  assert(BestVPlan.hasUF(BestUF) &&
         "Trying to execute plan with unsupported UF");
  if (BestVPlan.hasEarlyExit())
    ++LoopsEarlyExitVectorized;
  // TODO: Move to VPlan transform stage once the transition to the VPlan-based
  // cost model is complete for better cost estimates.
  VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
                           OrigLoop->getHeader()->getContext());
  VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
  VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
  bool HasBranchWeights =
      hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
  if (HasBranchWeights) {
    std::optional<unsigned> VScale = CM.getVScaleForTuning();
    VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
                             BestVPlan, BestVF, VScale);
  }

  if (!VectorizingEpilogue) {
    // Checks are the same for all VPlans, added to BestVPlan only for
    // compactness.
    attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
  }

  // Retrieve VectorPH now, while it's easier, i.e. while the VPlan still has
  // regions.
  VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
  VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
  VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
  VPlanTransforms::narrowInterleaveGroups(
      BestVPlan, BestVF,
      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
  VPlanTransforms::removeDeadRecipes(BestVPlan);

  VPlanTransforms::convertToConcreteRecipes(BestVPlan,
                                            *Legal->getWidestInductionType());
  // Regions are dissolved after optimizing for VF and UF, which completely
  // removes unneeded loop regions first.
  VPlanTransforms::dissolveLoopRegions(BestVPlan);
  // Perform the actual loop transformation.
  VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
                         OrigLoop->getParentLoop(),
                         Legal->getWidestInductionType());

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif

  // 0. Generate SCEV-dependent code in the entry, including TripCount, before
  // making any changes to the CFG.
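  // E.g., a symbolic trip count such as '(%n - 1) /u 4 + 1' is expanded to IR
  // in the entry block here, and all users in the VPlan are rewired to the
  // expanded live-in value.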
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;
  auto *Entry = cast<VPIRBasicBlock>(BestVPlan.getEntry());
  State.Builder.SetInsertPoint(Entry->getIRBasicBlock()->getTerminator());
  for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpSCEV)
      continue;
    ExpSCEV->execute(State);
    ExpandedSCEVs[ExpSCEV->getSCEV()] = State.get(ExpSCEV, VPLane(0));
    VPValue *Exp = BestVPlan.getOrAddLiveIn(ExpandedSCEVs[ExpSCEV->getSCEV()]);
    ExpSCEV->replaceAllUsesWith(Exp);
    if (BestVPlan.getTripCount() == ExpSCEV)
      BestVPlan.resetTripCount(Exp);
    ExpSCEV->eraseFromParent();
  }

  if (!ILV.getTripCount())
    ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
  else
    assert(VectorizingEpilogue && "should only re-use the existing trip "
                                  "count during epilogue vectorization");

  // 1. Set up the skeleton for vectorization, including vector pre-header and
  // middle block. The vector loop is created during VPlan execution.
  BasicBlock *EntryBB =
      cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  if (VectorizingEpilogue)
    VPlanTransforms::removeDeadRecipes(BestVPlan);

  assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
         "final VPlan is invalid");

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  BestVPlan.prepareToExecute(
      ILV.getTripCount(),
      ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
  replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);

  // Move check blocks to their final position.
  // TODO: Move as part of VPIRBB execute and update impacted tests.
  if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
    MemCheckBlock->moveAfter(EntryBB);
  if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
    SCEVCheckBlock->moveAfter(EntryBB);

  BestVPlan.execute(&State);

  // 2.5 When vectorizing the epilogue, fix reduction resume values from the
  // additional bypass block.
  if (VectorizingEpilogue) {
    assert(!BestVPlan.hasEarlyExit() &&
           "Epilogue vectorization not yet supported with early exits");
    BasicBlock *PH = OrigLoop->getLoopPreheader();
    BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
    for (auto *Pred : predecessors(PH)) {
      for (PHINode &Phi : PH->phis()) {
        if (Phi.getBasicBlockIndex(Pred) != -1)
          continue;
        Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
      }
    }
    VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader();
    if (ScalarPH->getNumPredecessors() > 0) {
      // If ScalarPH has predecessors, we may need to update its reduction
      // resume values.
      for (VPRecipeBase &R : ScalarPH->phis()) {
        fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), State,
                                                      BypassBlock);
      }
    }
  }

  // 2.6. Maintain Loop Hints
  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
  if (HeaderVPBB) {
    MDNode *OrigLoopID = OrigLoop->getLoopID();

    std::optional<MDNode *> VectorizedLoopID =
        makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                        LLVMLoopVectorizeFollowupVectorized});

    Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
    if (VectorizedLoopID) {
      L->setLoopID(*VectorizedLoopID);
    } else {
      // Keep all loop hints from the original loop on the vector loop (we'll
      // replace the vectorizer-specific hints below).
      if (MDNode *LID = OrigLoop->getLoopID())
        L->setLoopID(LID);

      LoopVectorizeHints Hints(L, true, *ORE);
      Hints.setAlreadyVectorized();

      // Check if it's EVL-vectorized and mark the corresponding metadata.
      bool IsEVLVectorized =
          llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) {
            // Looking for the ExplicitVectorLength VPInstruction.
            if (const auto *VI = dyn_cast<VPInstruction>(&Recipe))
              return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
            return false;
          });
      if (IsEVLVectorized) {
        LLVMContext &Context = L->getHeader()->getContext();
        MDNode *LoopID = L->getLoopID();
        auto *IsEVLVectorizedMD = MDNode::get(
            Context,
            {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"),
             MDString::get(Context, "evl")});
        MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {},
                                                           {IsEVLVectorizedMD});
        L->setLoopID(NewLoopID);
      }
    }
    TargetTransformInfo::UnrollingPreferences UP;
    TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
    if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
      addRuntimeUnrollDisableMetaData(L);
  }

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();

  return ExpandedSCEVs;
}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
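/// Roughly, this first pass creates (the second pass later retargets some of
/// these edges):
///   iter.check ----------(too few iters)----------> scalar preheader
///   iter.check -> vector.main.loop.iter.check
///                   |--(too few iters for main VF)-> scalar preheader
///                   '-> vector.ph -> main vector loop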
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  createVectorLoopSkeleton("");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitIterationCountCheck(LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path-length
  // for the main loop is compensated for by the gain from vectorizing the
  // larger trip count. Note: the branch will get updated later on when we
  // vectorize the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitIterationCountCheck(LoopScalarPreHeader, false);

  // Generate the induction variable.
  EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);

  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
  return LoopVectorPreHeader;
}

void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
           << "Main Loop VF:" << EPI.MainLoopVF
           << ", Main Loop UF:" << EPI.MainLoopUF
           << ", Epilogue Loop VF:" << EPI.EpilogueVF
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "intermediate fn:\n"
           << *OrigLoop->getHeader()->getParent() << "\n";
  });
}

BasicBlock *
EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                                    bool ForEpilogue) {
  assert(Bypass && "Expected valid bypass basic block.");
  Value *Count = getTripCount();
  MinProfitableTripCount = ElementCount::getFixed(0);
  Value *CheckMinIters = createIterationCountCheck(
      ForEpilogue ? EPI.EpilogueVF : VF, ForEpilogue ? EPI.EpilogueUF : UF);

  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                   static_cast<DominatorTree *>(nullptr), LI,
                                   nullptr, "vector.ph");

  if (ForEpilogue) {
    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  BranchInst &BI =
      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);

  // When vectorizing the main loop, its trip-count check is placed in a new
  // block, whereas the overall trip-count check is placed in the VPlan entry
  // block. When vectorizing the epilogue loop, its trip-count check is placed
  // in the VPlan entry block.
  if (!ForEpilogue)
    introduceCheckBlockInVPlan(TCCheckBlock);
  return TCCheckBlock;
}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerEpilogueLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and, if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
  LoopVectorPreHeader->setName("vec.epilog.ph");
  BasicBlock *VecEpilogueIterationCountCheck =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
                 nullptr, "vec.epilog.iter.check", true);
  emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);
  AdditionalBypassBlock = VecEpilogueIterationCountCheck;

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  // Adjust the terminators of runtime check blocks and phis using them.
  BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
  BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
  if (SCEVCheckBlock)
    SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (MemCheckBlock)
    MemCheckBlock->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);

  // The vec.epilog.iter.check block may contain Phi nodes from inductions or
  // reductions which merge control-flow from the latch block and the middle
  // block. Update the incoming values here and move the Phi into the
  // preheader.
  SmallVector<PHINode *, 4> PhisInBlock(
      llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
    // value and also those from other check blocks. This is needed for
    // reduction phis only.
    if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (SCEVCheckBlock)
      Phi->removeIncomingValue(SCEVCheckBlock);
    if (MemCheckBlock)
      Phi->removeIncomingValue(MemCheckBlock);
  }

  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
  return LoopVectorPreHeader;
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
               ? ICmpInst::ICMP_ULE
               : ICmpInst::ICMP_ULT;

  Value *CheckMinIters =
      Builder.CreateICmp(P, Count,
                         createStepForVF(Builder, Count->getType(),
                                         EPI.EpilogueVF, EPI.EpilogueUF),
                         "min.epilog.iters.check");

  BranchInst &BI =
      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
    // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't
    // think the MainLoopStep is correct.
    unsigned MainLoopStep = UF * VF.getKnownMinValue();
    unsigned EpilogueLoopStep =
        EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
    // We assume the remaining `Count` is equally distributed in
    // [0, MainLoopStep).
    // So the probability for `Count < EpilogueLoopStep` should be
    // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
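    // E.g., with MainLoopStep = 8 and EpilogueLoopStep = 4, the weights below
    // become {4, 4}, i.e. an estimated 50% chance of skipping the epilogue
    // loop.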
    unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
    const uint32_t Weights[] = {EstimatedSkipCount,
                                MainLoopStep - EstimatedSkipCount};
    setBranchWeights(BI, Weights, /*IsExpected=*/false);
  }
  ReplaceInstWithInst(Insert->getTerminator(), &BI);

  // A new entry block has been created for the epilogue VPlan. Hook it in, as
  // otherwise we would try to modify the entry to the main vector loop.
  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
  VPBasicBlock *OldEntry = Plan.getEntry();
  VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
  Plan.setEntry(NewEntry);
  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.

  return Insert;
}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
           << "Epilogue Loop VF:" << EPI.EpilogueVF
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
  });
}

VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
                                  VFRange &Range) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto WillWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = getBlockInMask(Builder.getInsertBlock());

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
  if (Consecutive) {
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Ptr->getUnderlyingValue()->stripPointerCasts());
    VPSingleDefRecipe *VectorPtr;
    if (Reverse) {
      // When folding the tail, we may compute an address that we don't
      // compute in the original scalar loop, and it may not be inbounds.
      // Drop the inbounds flag in that case.
      GEPNoWrapFlags Flags =
          (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
              ? GEPNoWrapFlags::none()
              : GEPNoWrapFlags::inBounds();
      VectorPtr =
          new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
                                       /*Stride*/ -1, Flags, I->getDebugLoc());
    } else {
      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                            GEP ? GEP->getNoWrapFlags()
                                                : GEPNoWrapFlags::none(),
                                            I->getDebugLoc());
    }
    Builder.insert(VectorPtr);
    Ptr = VectorPtr;
  }
  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                 VPIRMetadata(*Load, LVer), I->getDebugLoc());

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
                                Reverse, VPIRMetadata(*Store, LVer),
                                I->getDebugLoc());
}

/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe.
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
                            VPValue *Start, const InductionDescriptor &IndDesc,
                            VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
  assert(IndDesc.getStartValue() ==
         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
         "step must be loop invariant");

  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
  if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
                                             IndDesc, TruncI,
                                             TruncI->getDebugLoc());
  }
  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
                                           IndDesc, Phi->getDebugLoc());
}

VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {

  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
    return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
                                       *PSE.getSE(), *OrigLoop);

  // Check if this is a pointer induction. If so, build the recipe for it.
  if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
    VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
                                                           *PSE.getSE());
    return new VPWidenPointerInductionRecipe(
        Phi, Operands[0], Step, &Plan.getVFxUF(), *II,
        LoopVectorizationPlanner::getDecisionAndClampRange(
            [&](ElementCount VF) {
              return CM.isScalarAfterVectorization(Phi, VF);
            },
            Range),
        Phi->getDebugLoc());
  }
  return nullptr;
}

VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.
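  // E.g., for 'i = phi i64' and 't = trunc i to i32', we can widen 't'
  // directly as an i32 induction rather than widening the i64 induction and
  // truncating every element.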

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto IsOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          IsOptimizableIVTruncate(I), Range)) {

    auto *Phi = cast<PHINode>(I->getOperand(0));
    const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
    VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
    return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
                                       *OrigLoop);
  }
  return nullptr;
}

VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) {
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));

  // Is it beneficial to perform the intrinsic call compared to a lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
                                      CI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual call for the vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
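        //
        // For example (an illustrative vector-ABI name, not from this file):
        // a variant such as "_ZGVnN4v_foo" provides "foo" only at VF 4, so a
        // plan holding it cannot be reused for VF 8.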
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The block needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the block, but the only available
      //      vector variant at this VF requires a mask, so we synthesize an
      //      all-true mask.
      VPValue *Mask = nullptr;
      if (Legal->isMaskRequired(CI))
        Mask = getBlockInMask(Builder.getInsertBlock());
      else
        Mask = Plan.getOrAddLiveIn(
            ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));

      Ops.insert(Ops.begin() + *MaskPos, Mask);
    }

    Ops.push_back(Operands.back());
    return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
  }

  return nullptr;
}

bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable or it is predicated.
  auto WillScalarize = [this, I](ElementCount VF) -> bool {
    return CM.isScalarAfterVectorization(I, VF) ||
           CM.isProfitableToScalarize(I, VF) ||
           CM.isScalarWithPredication(I, VF);
  };
  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
                                                             Range);
}

VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands) {
  switch (I->getOpcode()) {
  default:
    return nullptr;
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // If not provably safe, use a select to form a safe divisor before
    // widening the div/rem operation itself. Otherwise fall through to the
    // general handling below.
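    //
    // For example (illustrative): a predicated "udiv %a, %b" becomes
    //   %safe = select <mask>, %b, 1
    //   %res  = udiv %a, %safe
    // so that lanes masked off never divide by zero.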
    if (CM.isPredicatedInst(I)) {
      SmallVector<VPValue *> Ops(Operands);
      VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
      VPValue *One =
          Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
      auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
      Ops[1] = SafeRHS;
      return new VPWidenRecipe(*I, Ops);
    }
    [[fallthrough]];
  }
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FNeg:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::ICmp:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::Xor:
  case Instruction::Freeze: {
    SmallVector<VPValue *> NewOps(Operands);
    if (Instruction::isBinaryOp(I->getOpcode())) {
      // The legacy cost model uses SCEV to check if some of the operands are
      // constants. To match the legacy cost model's behavior, use SCEV to try
      // to replace operands with constants.
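      //
      // For example (illustrative): if SCEV proves a loop-invariant operand
      // %n to be the constant 42, the operand is replaced by the live-in
      // constant 42 so the VPlan-based cost matches the legacy model.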
      ScalarEvolution &SE = *PSE.getSE();
      auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
        if (!Op->isLiveIn())
          return Op;
        Value *V = Op->getUnderlyingValue();
        if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
          return Op;
        auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
        if (!C)
          return Op;
        return Plan.getOrAddLiveIn(C->getValue());
      };
      // For Mul, the legacy cost model checks both operands.
      if (I->getOpcode() == Instruction::Mul)
        NewOps[0] = GetConstantViaSCEV(NewOps[0]);
      // For other binops, the legacy cost model only checks the second
      // operand.
      NewOps[1] = GetConstantViaSCEV(NewOps[1]);
    }
    return new VPWidenRecipe(*I, NewOps);
  }
  case Instruction::ExtractValue: {
    SmallVector<VPValue *> NewOps(Operands);
    Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
    auto *EVI = cast<ExtractValueInst>(I);
    assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
    unsigned Idx = EVI->getIndices()[0];
    NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
    return new VPWidenRecipe(*I, NewOps);
  }
  };
}

VPHistogramRecipe *
VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
                                     ArrayRef<VPValue *> Operands) {
  // FIXME: Support other operations.
  unsigned Opcode = HI->Update->getOpcode();
  assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
         "Histogram update operation must be an Add or Sub");
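
  // For example (illustrative), this covers update patterns like
  //   buckets[indices[i]] += 1;
  // where the same bucket address may repeat within one vector iteration.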

  SmallVector<VPValue *, 3> HGramOps;
  // Bucket address.
  HGramOps.push_back(Operands[1]);
  // Increment value.
  HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));

  // In case of predicated execution (due to tail-folding, or conditional
  // execution, or both), pass the relevant mask.
  if (Legal->isMaskRequired(HI->Store))
    HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));

  return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
}

VPReplicateRecipe *
VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
                                   VFRange &Range) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors, if one of the operands is variant then we still
      // want to mark the call as uniform, which will generate one instruction
      // for just the first lane of the vector. We can't scalarize the call in
      // the same way as for fixed-width vectors because we don't know how many
      // lanes there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic, generating the instruction for the
      //      first lane is still better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics, the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // The instruction is not predicated; finalize its recipe without a mask.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = getBlockInMask(Builder.getInsertBlock());
  }

  // Note that there is some custom logic to mark some intrinsics as uniform
  // manually above for scalable vectors, which this assert needs to account
  // for as well.
  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
         "Should not predicate a uniform recipe");
  auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask,
                                       VPIRMetadata(*I, LVer));
  return Recipe;
}

/// Find all possible partial reductions in the loop and track all of those
/// that are valid so recipes can be formed later.
void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
  // Find all possible partial reductions.
  SmallVector<std::pair<PartialReductionChain, unsigned>>
      PartialReductionChains;
  for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
    getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
                        PartialReductionChains);
  }

  // A partial reduction is invalid if any of its extends are used by
  // something that isn't another partial reduction. This is because the
  // extends are intended to be lowered along with the reduction itself.
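  //
  // For example (illustrative): in a dot-product style chain
  //   sum += sext(a[i]) * sext(b[i])
  // both extends must feed only the multiply-accumulate; any other user would
  // still require the wide extended values to be materialized.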

  // Build up a set of partial reduction ops for efficient use checking.
  SmallSet<User *, 4> PartialReductionOps;
  for (const auto &[PartialRdx, _] : PartialReductionChains)
    PartialReductionOps.insert(PartialRdx.ExtendUser);

  auto ExtendIsOnlyUsedByPartialReductions =
      [&PartialReductionOps](Instruction *Extend) {
        return all_of(Extend->users(), [&](const User *U) {
          return PartialReductionOps.contains(U);
        });
      };

  // Check if each use of a chain's two extends is a partial reduction
  // and only add those that don't have non-partial reduction users.
  for (auto Pair : PartialReductionChains) {
    PartialReductionChain Chain = Pair.first;
    if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
        (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
      ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
  }
}

bool VPRecipeBuilder::getScaledReductions(
    Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
    SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
  if (!CM.TheLoop->contains(RdxExitInstr))
    return false;

  auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
  if (!Update)
    return false;

  Value *Op = Update->getOperand(0);
  Value *PhiOp = Update->getOperand(1);
  if (Op == PHI)
    std::swap(Op, PhiOp);

  // Try and get a scaled reduction from the first non-phi operand.
  // If one is found, we use the discovered reduction instruction in
  // place of the accumulator for costing.
  if (auto *OpInst = dyn_cast<Instruction>(Op)) {
    if (getScaledReductions(PHI, OpInst, Range, Chains)) {
      PHI = Chains.rbegin()->first.Reduction;

      Op = Update->getOperand(0);
      PhiOp = Update->getOperand(1);
      if (Op == PHI)
        std::swap(Op, PhiOp);
    }
  }
  if (PhiOp != PHI)
    return false;

  using namespace llvm::PatternMatch;

  // If the update is a binary operator, check both of its operands to see if
  // they are extends. Otherwise, see if the update comes directly from an
  // extend.
  Instruction *Exts[2] = {nullptr};
  BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Op);
  std::optional<unsigned> BinOpc;
  Type *ExtOpTypes[2] = {nullptr};

  auto CollectExtInfo = [this, &Exts,
                         &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
    unsigned I = 0;
    for (Value *OpI : Ops) {
      Value *ExtOp;
      if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
        return false;
      Exts[I] = cast<Instruction>(OpI);

      // TODO: We should be able to support live-ins.
      if (!CM.TheLoop->contains(Exts[I]))
        return false;

      ExtOpTypes[I] = ExtOp->getType();
      I++;
    }
    return true;
  };

  if (ExtendUser) {
    if (!ExtendUser->hasOneUse())
      return false;

    // Use the side effect of match to replace ExtendUser only if the negation
    // pattern matches; the boolean result itself is not needed here.
    match(ExtendUser, m_Neg(m_BinOp(ExtendUser)));

    SmallVector<Value *> Ops(ExtendUser->operands());
    if (!CollectExtInfo(Ops))
      return false;

    BinOpc = std::make_optional(ExtendUser->getOpcode());
  } else if (match(Update, m_Add(m_Value(), m_Value()))) {
    // We already know the operands for Update are Op and PhiOp.
    SmallVector<Value *> Ops({Op});
    if (!CollectExtInfo(Ops))
      return false;

    ExtendUser = Update;
    BinOpc = std::nullopt;
  } else
    return false;

  TTI::PartialReductionExtendKind OpAExtend =
      TTI::getPartialReductionExtendKind(Exts[0]);
  TTI::PartialReductionExtendKind OpBExtend =
      Exts[1] ? TTI::getPartialReductionExtendKind(Exts[1]) : TTI::PR_None;
  PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);

  TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
  TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
  if (!PHISize.hasKnownScalarFactor(ASize))
    return false;
  unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(ASize);
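  // Illustrative example: accumulating i8 inputs extended into an i32 phi
  // gives a scale factor of 4, i.e. the partial reduction produces VF/4
  // result lanes.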

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          [&](ElementCount VF) {
            InstructionCost Cost = TTI->getPartialReductionCost(
                Update->getOpcode(), ExtOpTypes[0], ExtOpTypes[1],
                PHI->getType(), VF, OpAExtend, OpBExtend, BinOpc, CM.CostKind);
            return Cost.isValid();
          },
          Range)) {
    Chains.emplace_back(Chain, TargetScaleFactor);
    return true;
  }

  return false;
}

VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
                                                      VFRange &Range) {
  // First, check for specific widening recipes that deal with inductions, Phi
  // nodes, calls and memory operations.
  VPRecipeBase *Recipe;
  Instruction *Instr = R->getUnderlyingInstr();
  SmallVector<VPValue *, 4> Operands(R->operands());
  if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(R)) {
    VPBasicBlock *Parent = PhiR->getParent();
    [[maybe_unused]] VPRegionBlock *LoopRegionOf =
        Parent->getEnclosingLoopRegion();
    assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
           "Non-header phis should have been handled during predication");
    auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
    assert(Operands.size() == 2 && "Must have 2 operands for header phis");
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
      return Recipe;

    VPHeaderPHIRecipe *PhiRecipe = nullptr;
    assert((Legal->isReductionVariable(Phi) ||
            Legal->isFixedOrderRecurrence(Phi)) &&
           "can only widen reductions and fixed-order recurrences here");
    VPValue *StartV = Operands[0];
    if (Legal->isReductionVariable(Phi)) {
      const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi);
      assert(RdxDesc.getRecurrenceStartValue() ==
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));

      // If the PHI is used by a partial reduction, set the scale factor.
      unsigned ScaleFactor =
          getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
      PhiRecipe = new VPReductionPHIRecipe(
          Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi),
          CM.useOrderedReductions(RdxDesc), ScaleFactor);
    } else {
      // TODO: Currently fixed-order recurrences are modeled as chains of
      // first-order recurrences. If there are no users of the intermediate
      // recurrences in the chain, the fixed order recurrence should be modeled
      // directly, enabling more efficient codegen.
      PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
    }
    // Add backedge value.
    PhiRecipe->addOperand(Operands[1]);
    return PhiRecipe;
  }

  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                    cast<TruncInst>(Instr), Operands, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Instr))
    return tryToWidenCall(CI, Operands, Range);

  if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
    if (auto HistInfo = Legal->getHistogramInfo(SI))
      return tryToWidenHistogram(*HistInfo, Operands);

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return tryToWidenMemory(Instr, Operands, Range);

  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
    return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
    return new VPWidenGEPRecipe(GEP, Operands);

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    return new VPWidenSelectRecipe(*SI, Operands);
  }

  if (auto *CI = dyn_cast<CastInst>(Instr)) {
    return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
                                 *CI);
  }

  return tryToWiden(Instr, Operands);
}

VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
                                             ArrayRef<VPValue *> Operands,
                                             unsigned ScaleFactor) {
  assert(Operands.size() == 2 &&
         "Unexpected number of operands for partial reduction");

  VPValue *BinOp = Operands[0];
  VPValue *Accumulator = Operands[1];
  VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
  if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
      isa<VPPartialReductionRecipe>(BinOpRecipe))
    std::swap(BinOp, Accumulator);

  unsigned ReductionOpcode = Reduction->getOpcode();
  if (ReductionOpcode == Instruction::Sub) {
    auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
    SmallVector<VPValue *, 2> Ops;
    Ops.push_back(Plan.getOrAddLiveIn(Zero));
    Ops.push_back(BinOp);
    BinOp = new VPWidenRecipe(*Reduction, Ops);
    Builder.insert(BinOp->getDefiningRecipe());
    ReductionOpcode = Instruction::Add;
  }
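  // The rewrite above turns, illustratively, "acc -= x" into "acc += (0 - x)",
  // so the partial reduction itself is always an add.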

  VPValue *Cond = nullptr;
  if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
    assert((ReductionOpcode == Instruction::Add ||
            ReductionOpcode == Instruction::Sub) &&
           "Expected an ADD or SUB operation for predicated partial "
           "reductions (because the neutral element in the mask is zero)!");
    Cond = getBlockInMask(Builder.getInsertBlock());
    VPValue *Zero =
        Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
    BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
  }
  return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
                                      ScaleFactor, Reduction);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  if (ElementCount::isKnownGT(MinVF, MaxVF))
    return;

  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  const LoopAccessInfo *LAI = Legal->getLAI();
  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
                      OrigLoop, LI, DT, PSE.getSE());
  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
    // Only use noalias metadata when using memory checks guaranteeing no
    // overlap across all iterations.
    LVer.prepareNoAliasMetadata();
  }

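  // Each iteration below builds a VPlan for a clamped subrange of VFs. For
  // example (illustrative): with MinVF=2 and MaxVF=8, the first candidate
  // range is [2, 16); decisions taken while building the plan may clamp its
  // end, and the next iteration resumes from the clamped end.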
  auto MaxVFTimes2 = MaxVF * 2;
  auto VPlan0 = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(
            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
      bool HasScalarVF = Plan->hasScalarVFOnly();
      // Now optimize the initial VPlan.
      if (!HasScalarVF)
        VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
                                 *Plan, CM.getMinimalBitwidths());
      VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
      // TODO: try to put it close to addActiveLaneMask().
      // Discard the plan if it is not EVL-compatible.
      if (CM.foldTailWithEVL() && !HasScalarVF &&
          !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
                                    *Plan, CM.getMaxSafeElements()))
        break;
      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(std::move(Plan));
    }
    VF = SubRange.End;
  }
}

/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
/// the end value of the induction.
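///
/// For example (illustrative): for a canonical induction starting at 0 in a
/// loop vectorized by 4, the resume value is the vector trip count when the
/// scalar loop is entered from the middle block, and the start value 0 when
/// the vector loop is bypassed.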
static VPInstruction *addResumePhiRecipeForInduction(
    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
    VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
  // Truncated wide inductions resume from the last lane of their vector value
  // in the last vector iteration, which is handled elsewhere.
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  VPValue *Start = WideIV->getStartValue();
  VPValue *Step = WideIV->getStepValue();
  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
  VPValue *EndValue = VectorTC;
  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    EndValue = VectorPHBuilder.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, VectorTC, Step);
  }

  // EndValue is derived from the vector trip count (which has the same type as
  // the widest induction) and thus may be wider than the induction here.
  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
    EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
                                                ScalarTypeOfWideIV,
                                                WideIV->getDebugLoc());
  }

  auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
      {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
  return ResumePhiRecipe;
}

/// Create resume phis in the scalar preheader for first-order recurrences,
/// reductions and inductions, and update the VPIRInstructions wrapping the
/// original phis in the scalar header. End values for inductions are added to
/// \p IVEndValues.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
                                DenseMap<VPValue *, VPValue *> &IVEndValues) {
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
  auto *ScalarPH = Plan.getScalarPreheader();
  auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBuilder VectorPHBuilder(
      cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
  VPBuilder ScalarPHBuilder(ScalarPH);
  for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
    auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);

    // TODO: Extract final value from induction recipe initially, optimize to
    // pre-computed end value together in optimizeInductionExitUsers.
    auto *VectorPhiR =
        cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
    if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
      if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
              WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
              &Plan.getVectorTripCount())) {
        assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
        IVEndValues[WideIVR] = ResumePhi->getOperand(0);
        ScalarPhiIRI->addOperand(ResumePhi);
        continue;
      }
      // TODO: Also handle truncated inductions here. Computing end-values
      // separately should be done as VPlan-to-VPlan optimization, after
      // legalizing all resume values to use the last lane from the loop.
      assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
             "should only skip truncated wide inductions");
      continue;
    }

    // The backedge value provides the value to resume coming out of a loop,
    // which for FORs is a vector whose last element needs to be extracted. The
    // start value provides the value if the loop is bypassed.
    bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
    auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");
    if (IsFOR)
      ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
          VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
          "vector.recur.extract");
    StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
    auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
        {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
    ScalarPhiIRI->addOperand(ResumePhiR);
  }
}

// Collect VPIRInstructions for phis in the exit block from the latch only.
static SetVector<VPIRInstruction *> collectUsersInLatchExitBlock(VPlan &Plan) {
  SetVector<VPIRInstruction *> ExitUsersToFix;
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {

    if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock())
      continue;

    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(&R);
      assert(ExitIRI->getNumOperands() == 1 && "must have a single operand");
      VPValue *V = ExitIRI->getOperand(0);
      if (V->isLiveIn())
        continue;
      assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
             "Only recipes defined inside a region should need fixing.");
      ExitUsersToFix.insert(ExitIRI);
    }
  }
  return ExitUsersToFix;
}

// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated.
static void
addUsersInExitBlocks(VPlan &Plan,
                     const SetVector<VPIRInstruction *> &ExitUsersToFix) {
  if (ExitUsersToFix.empty())
    return;

  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // Introduce extracts for exiting values and update the VPIRInstructions
  // modeling the corresponding LCSSA phis.
  for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
    assert(ExitIRI->getNumOperands() == 1 &&
           ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
           "exit values from early exits must be fixed when branch to "
           "early-exit is added");
    ExitIRI->extractLastLaneOfFirstOperand(B);
  }
}

/// Handle users in the original exit block for first-order recurrences. The
/// penultimate value of recurrences is fed to their LCSSA phi users in the
/// original exit block using the VPIRInstruction wrapping the LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(
    VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix, VFRange &Range) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *ScalarPHVPBB = Plan.getScalarPreheader();
  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder ScalarPHBuilder(ScalarPHVPBB);
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  auto IsScalableOne = [](ElementCount VF) -> bool {
    return VF == ElementCount::getScalable(1);
  };

  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
    if (!FOR)
      continue;

    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");

    // This is the second phase of vectorizing first-order recurrences,
    // creating extracts for users outside the loop. An overview of the
    // transformation is described below. Suppose we have the following loop
    // with some use after the loop of the last a[i-1],
    //
    //   for (int i = 0; i < n; ++i) {
    //     t = a[i - 1];
    //     b[i] = a[i] - t;
    //   }
    //   use t;
    //
    // There is a first-order recurrence on "a". For this loop, the shorthand
    // scalar IR looks like:
    //
    //   scalar.ph:
    //     s.init = a[-1]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     use = lcssa.phi [s1, scalar.body]
    //
    // In this example, s1 is a recurrence because its value depends on the
    // previous iteration. In the first phase of vectorization, we created a
    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
    // for users in the scalar preheader and exit block.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3]
    //     b[i] = v2 - v1
    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
    //     b[i, i+1, i+2, i+3] = v2 - v1
    //     br cond, vector.body, middle.block
    //
    //   middle.block:
    //     vector.recur.extract.for.phi = v2(2)
    //     vector.recur.extract = v2(3)
    //     br cond, scalar.ph, exit.block
    //
    //   scalar.ph:
    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
    //                             [s.init, otherwise]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     lo = lcssa.phi [s1, scalar.body],
    //                    [vector.recur.extract.for.phi, middle.block]
    //
    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
    // Extract the penultimate value of the recurrence and use it as operand
    // for the VPIRInstruction modeling the phi.
    for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
      if (ExitIRI->getOperand(0) != FOR)
        continue;
      // For VF vscale x 1, if vscale = 1, we are unable to extract the
      // penultimate value of the recurrence. Instead, we rely on function
      // addUsersInExitBlocks to extract the last element from the result of
      // VPInstruction::FirstOrderRecurrenceSplice by leaving the user of the
      // recurrence phi in ExitUsersToFix.
      // TODO: Consider vscale_range info and UF.
      if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
                                                             Range))
        return;
      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
          VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
          {}, "vector.recur.extract.for.phi");
      ExitIRI->setOperand(0, PenultimateElement);
      ExitUsersToFix.remove(ExitIRI);
    }
  }
}

VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {

  using namespace llvm::VPlanPatternMatch;
  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create initial VPlan skeleton, having a basic block for the pre-header
  // which contains SCEV expansions that need to happen before the CFG is
  // modified; a basic block for the vector pre-header, followed by a region for
  // the vector loop, followed by the middle basic block. The skeleton vector
  // loop region contains a header and latch basic blocks.

  bool RequiresScalarEpilogueCheck =
      LoopVectorizationPlanner::getDecisionAndClampRange(
          [this](ElementCount VF) {
            return !CM.requiresScalarEpilogue(VF.isVector());
          },
          Range);
  VPlanTransforms::prepareForVectorization(
      *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
      CM.foldTailByMasking(), OrigLoop,
      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()),
      Legal->hasUncountableEarlyExit(), Range);
  VPlanTransforms::createLoopRegions(*Plan);

  // Don't use getDecisionAndClampRange here, because we don't know the UF, so
  // it is better to be conservative here rather than to split this up into
  // different VPlans.
  // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
  bool IVUpdateMayOverflow = false;
  for (ElementCount VF : Range)
    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);

  TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
  // Use NUW for the induction increment if we proved that it won't overflow in
  // the vector loop or when not folding the tail. In the latter case, we know
  // that the canonical induction increment will not overflow as the vector trip
  // count is >= increment and a multiple of the increment.
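  // When folding the tail, the increment in the final vector iteration may
  // step past the original trip count, so the no-wrap flags must be dropped
  // below.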
  bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
  if (!HasNUW) {
    auto *IVInc = Plan->getVectorLoopRegion()
                      ->getExitingBasicBlock()
                      ->getTerminator()
                      ->getOperand(0);
    assert(match(IVInc, m_VPInstruction<Instruction::Add>(
                            m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
           "Did not find the canonical IV increment");
    cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
  }

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto ApplyIG = [IG, this](ElementCount VF) -> bool {
      bool Result = (VF.isVector() && // Query is illegal for VF == 1
                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
                         LoopVectorizationCostModel::CM_Interleave);
      // For scalable vectors, the interleave factors must be <= 8 since we
      // require the (de)interleaveN intrinsics instead of shufflevectors.
      assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
             "Unsupported interleave factor for scalable vectors");
      return Result;
    };
    if (!getDecisionAndClampRange(ApplyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
  }

  // ---------------------------------------------------------------------------
  // Predicate and linearize the top-level loop region.
  // ---------------------------------------------------------------------------
  auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
      *Plan, CM.foldTailByMasking());

  // ---------------------------------------------------------------------------
  // Construct wide recipes and apply predication for original scalar
  // VPInstructions in the loop.
  // ---------------------------------------------------------------------------
  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                Builder, BlockMaskCache, LVer);
  RecipeBuilder.collectScaledReductions(Range);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      HeaderVPBB);

  auto *MiddleVPBB = Plan->getMiddleBlock();
  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
  // Mapping from VPValues in the initial plan to their widened VPValues. Needed
  // temporarily to update created block masks.
  DenseMap<VPValue *, VPValue *> Old2New;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Convert input VPInstructions to widened recipes.
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *SingleDef = cast<VPSingleDefRecipe>(&R);
      auto *UnderlyingValue = SingleDef->getUnderlyingValue();
      // Skip recipes that do not need transforming, including canonical IV,
      // wide canonical IV and VPInstructions without underlying values. The
      // latter are added above for masking.
      // FIXME: Migrate code relying on the underlying instruction from VPlan0
      // to construct recipes below to not use the underlying instruction.
      if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
              &R) ||
          (isa<VPInstruction>(&R) && !UnderlyingValue))
        continue;

      // FIXME: VPlan0, which models a copy of the original scalar loop, should
      // not use VPWidenPHIRecipe to model the phis.
      assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
             UnderlyingValue && "unsupported recipe");

      // TODO: Gradually replace uses of underlying instruction by analyses on
      // VPlan.
      Instruction *Instr = cast<Instruction>(UnderlyingValue);
      Builder.setInsertPoint(SingleDef);

      // The stores with invariant address inside the loop will be deleted, and
      // in the exit block, a uniform store recipe will be created for the
      // final invariant store of the reduction.
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(Instr)) &&
          Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
        // Only create recipe for the final invariant store of the reduction.
        if (Legal->isInvariantStoreOfReduction(SI)) {
          auto *Recipe =
              new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
                                    nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
          Recipe->insertBefore(*MiddleVPBB, MBIP);
        }
        R.eraseFromParent();
        continue;
      }

      VPRecipeBase *Recipe =
          RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
      if (!Recipe) {
        SmallVector<VPValue *, 4> Operands(R.operands());
        Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
      }

      RecipeBuilder.setRecipe(Instr, Recipe);
      if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
        // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
        // moved to the phi section in the header.
        Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
      } else {
        Builder.insert(Recipe);
      }
      if (Recipe->getNumDefinedValues() == 1) {
        SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
        Old2New[SingleDef] = Recipe->getVPSingleValue();
      } else {
        assert(Recipe->getNumDefinedValues() == 0 &&
               "Unexpected multidef recipe");
        R.eraseFromParent();
      }
    }
  }

  // replaceAllUsesWith above may invalidate the block masks. Update them here.
  // TODO: Include the masks as operands in the predicated VPlan directly to
  // remove the need to keep a map of masks beyond the predication transform.
  RecipeBuilder.updateBlockMaskCache(Old2New);
  for (const auto &[Old, _] : Old2New)
    Old->getDefiningRecipe()->eraseFromParent();

  assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
         !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
         "entry block must be set to a VPRegionBlock having a non-empty entry "
         "VPBasicBlock");

  // Update wide induction increments to use the same step as the corresponding
  // wide induction. This enables detecting induction increments directly in
  // VPlan and removes redundant splats.
  for (const auto &[Phi, ID] : Legal->getInductionVars()) {
    auto *IVInc = cast<Instruction>(
        Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
    if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
      continue;
    VPWidenInductionRecipe *WideIV =
        cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
    VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
    R->setOperand(1, WideIV->getStepValue());
  }

  DenseMap<VPValue *, VPValue *> IVEndValues;
  addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
  SetVector<VPIRInstruction *> ExitUsersToFix =
      collectUsersInLatchExitBlock(*Plan);
  addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix, Range);
  addUsersInExitBlocks(*Plan, ExitUsersToFix);

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Adjust the recipes for any inloop reductions.
  adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);

  // Apply mandatory transformation to handle FP maxnum/minnum reduction with
  // NaNs if possible, bail out otherwise.
  if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
                                *Plan))
    return nullptr;

  // Transform recipes to abstract recipes if it is legal and beneficial and
  // clamp the range for better cost estimation.
  // TODO: Enable following transform when the EVL-version of extended-reduction
  // and mulacc-reduction are implemented.
  if (!CM.foldTailWithEVL()) {
    VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                          CM.CostKind);
    VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                             CostCtx, Range);
  }

  for (ElementCount VF : Range)
    Plan->addVF(VF);
  Plan->setName("Initial VPlan");

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan,
                           InterleaveGroups, RecipeBuilder,
                           CM.isScalarEpilogueAllowed());

  // Replace VPValues for known constant strides guaranteed by predicated
  // scalar evolution.
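  //
  // For example (illustrative): if the runtime checks guarantee %stride == 1,
  // uses of %stride inside the vector region are replaced by the constant 1,
  // enabling consecutive-access recipes.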
  auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
    auto *R = cast<VPRecipeBase>(&U);
    return R->getParent()->getParent() ||
           R->getParent() ==
               Plan->getVectorLoopRegion()->getSinglePredecessor();
  };
  for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
    auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
    // Only handle constant strides for now.
    if (!ScevStride)
      continue;

    auto *CI = Plan->getOrAddLiveIn(
        ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
    if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);

    // The versioned value may not be used in the loop directly but through a
    // sext/zext. Add new live-ins in those cases.
    for (Value *U : StrideV->users()) {
      if (!isa<SExtInst, ZExtInst>(U))
        continue;
      VPValue *StrideVPV = Plan->getLiveIn(U);
      if (!StrideVPV)
        continue;
      unsigned BW = U->getType()->getScalarSizeInBits();
      APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
                                 : ScevStride->getAPInt().zext(BW);
      VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
    }
  }

  auto BlockNeedsPredication = [this](BasicBlock *BB) {
    return Legal->blockNeedsPredication(BB);
  };
  VPlanTransforms::runPass(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan,
                           BlockNeedsPredication);

  // Sink users of fixed-order recurrences past the recipe defining the previous
  // value and introduce FirstOrderRecurrenceSplice VPInstructions.
  if (!VPlanTransforms::runPass(VPlanTransforms::adjustFixedOrderRecurrences,
                                *Plan, Builder))
    return nullptr;

  if (useActiveLaneMask(Style)) {
    // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
    // TailFoldingStyle is visible there.
    bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
    bool WithoutRuntimeCheck =
        Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
    VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
                                       WithoutRuntimeCheck);
  }
  VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}

VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
  VPlanTransforms::prepareForVectorization(
      *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop,
      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false,
      Range);
  VPlanTransforms::createLoopRegions(*Plan);

  for (ElementCount VF : Range)
    Plan->addVF(VF);

  if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
          Plan,
          [this](PHINode *P) {
            return Legal->getIntOrFpInductionDescriptor(P);
          },
          *PSE.getSE(), *TLI))
    return nullptr;

  // Collect mapping of IR header phis to header phi recipes, to be used in
  // addScalarResumePhis.
  DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                Builder, BlockMaskCache, nullptr /*LVer*/);
  for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    if (isa<VPCanonicalIVPHIRecipe>(&R))
      continue;
    auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
    RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
  }
  DenseMap<VPValue *, VPValue *> IVEndValues;
  // TODO: IVEndValues are not used yet in the native path, to optimize exit
  // values.
  addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions, the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and users outside the vector region when folding the tail.
//
// A ComputeReductionResult recipe is added to the middle block, also for
// in-loop reductions which compute their result in-loop, because generating
// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
//
// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in any
// iteration. The final value is selected by the final ComputeReductionResult.
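//
// For example (illustrative): for an in-loop reduction "sum += a[i] * b[i]",
// the add link becomes a VPReductionRecipe whose scalar chain operand is the
// running "sum" and whose vector operand is the widened multiply.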
adjustRecipesForReductions(VPlanPtr & Plan,VPRecipeBuilder & RecipeBuilder,ElementCount MinVF)8974 void LoopVectorizationPlanner::adjustRecipesForReductions(
8975 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8976 using namespace VPlanPatternMatch;
8977 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8978 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8979 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
8980 SmallVector<VPRecipeBase *> ToDelete;
8981
8982 for (VPRecipeBase &R : Header->phis()) {
8983 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8984 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8985 continue;
8986
8987 RecurKind Kind = PhiR->getRecurrenceKind();
8988 assert(
8989 !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
8990 !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
8991 "AnyOf and FindIV reductions are not allowed for in-loop reductions");
8992
8993 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8994 SetVector<VPSingleDefRecipe *> Worklist;
8995 Worklist.insert(PhiR);
8996 for (unsigned I = 0; I != Worklist.size(); ++I) {
8997 VPSingleDefRecipe *Cur = Worklist[I];
8998 for (VPUser *U : Cur->users()) {
8999 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9000 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9001 assert((UserRecipe->getParent() == MiddleVPBB ||
9002 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9003 "U must be either in the loop region, the middle block or the "
9004 "scalar preheader.");
9005 continue;
9006 }
9007 Worklist.insert(UserRecipe);
9008 }
9009 }
9010
9011 // Visit operation "Links" along the reduction chain top-down starting from
9012 // the phi until LoopExitValue. We keep track of the previous item
9013 // (PreviousLink) to tell which of the two operands of a Link will remain
9014 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9015 // the select instructions. Blend recipes of in-loop reduction phi's will
9016 // get folded to their non-phi operand, as the reduction recipe handles the
9017 // condition directly.
9018 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9019 for (VPSingleDefRecipe *CurrentLink : drop_begin(Worklist)) {
9020 if (auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink)) {
9021 assert(Blend->getNumIncomingValues() == 2 &&
9022 "Blend must have 2 incoming values");
9023 if (Blend->getIncomingValue(0) == PhiR) {
9024 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9025 } else {
9026 assert(Blend->getIncomingValue(1) == PhiR &&
9027 "PhiR must be an operand of the blend");
9028 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9029 }
9030 continue;
9031 }
9032
9033 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9034
9035 // Index of the first operand which holds a non-mask vector operand.
9036 unsigned IndexOfFirstOperand;
9037 // Recognize a call to the llvm.fmuladd intrinsic.
9038 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9039 VPValue *VecOp;
9040 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9041 if (IsFMulAdd) {
9042 assert(
9043 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9044 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9045 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9046 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9047 CurrentLink->getOperand(2) == PreviousLink &&
9048 "expected a call where the previous link is the added operand");
9049
9050 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9051 // need to create an fmul recipe (multiplying the first two operands of
9052 // the fmuladd together) to use as the vector operand for the fadd
9053 // reduction.
        VPInstruction *FMulRecipe = new VPInstruction(
            Instruction::FMul,
            {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
            CurrentLinkI->getFastMathFlags());
        LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
        VecOp = FMulRecipe;
      } else {
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
          if (isa<VPWidenRecipe>(CurrentLink)) {
            assert(isa<CmpInst>(CurrentLinkI) &&
                   "need to have the compare of the select");
            continue;
          }
          assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
                 "must be a select recipe");
          IndexOfFirstOperand = 1;
        } else {
          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
                 "Expected to replace a VPWidenSC");
          IndexOfFirstOperand = 0;
        }
        // Note that for non-commutable operands (cmp-selects), the semantics of
        // the cmp-select are captured in the recurrence kind.
        unsigned VecOpId =
            CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
                ? IndexOfFirstOperand + 1
                : IndexOfFirstOperand;
        VecOp = CurrentLink->getOperand(VecOpId);
        assert(VecOp != PreviousLink &&
               CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
                                       (VecOpId - IndexOfFirstOperand)) ==
                   PreviousLink &&
               "PreviousLink must be the operand other than VecOp");
      }

      VPValue *CondOp = nullptr;
      if (CM.blockNeedsPredicationForAnyReason(CurrentLinkI->getParent()))
        CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent());

      // TODO: Retrieve FMFs from recipes directly.
      RecurrenceDescriptor RdxDesc = Legal->getRecurrenceDescriptor(
          cast<PHINode>(PhiR->getUnderlyingInstr()));
      // Non-FP RdxDescs will have all fast math flags set, so clear them.
      FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
                               ? RdxDesc.getFastMathFlags()
                               : FastMathFlags();
      auto *RedRecipe = new VPReductionRecipe(
          Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
          PhiR->isOrdered(), CurrentLinkI->getDebugLoc());
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of its inputs, including CondOp.
      // Delete CurrentLink as it will be invalid if its operand is replaced
      // with a reduction defined at the bottom of the block in the next link.
      if (LinkVPBB->getNumSuccessors() == 0)
        RedRecipe->insertBefore(&*std::prev(std::prev(LinkVPBB->end())));
      else
        LinkVPBB->appendRecipe(RedRecipe);

      CurrentLink->replaceAllUsesWith(RedRecipe);
      ToDelete.push_back(CurrentLink);
      PreviousLink = RedRecipe;
    }
  }
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
  VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;

    const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
        cast<PHINode>(PhiR->getUnderlyingInstr()));
    Type *PhiTy = PhiR->getUnderlyingValue()->getType();
    // If tail is folded by masking, introduce selects between the phi
    // and the users outside the vector region of each reduction, at the
    // beginning of the dedicated latch block.
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    // Don't output selects for partial reductions because they have an output
    // with fewer lanes than the VF. So the operands of the select would have
    // different numbers of lanes. Partial reductions mask the input instead.
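    // E.g. (illustrative): with tail folding the exiting value becomes
    //   %sel = select <VF x i1> %mask, <VF x i32> %rdx.next, <VF x i32> %phi
    // so lanes past the trip count keep the value from the previous
    // iteration.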
    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
        !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
      VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
      std::optional<FastMathFlags> FMFs =
          PhiTy->isFloatingPointTy()
              ? std::make_optional(RdxDesc.getFastMathFlags())
              : std::nullopt;
      NewExitingVPV =
          Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
      OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
        return isa<VPInstruction>(&U) &&
               (cast<VPInstruction>(&U)->getOpcode() ==
                    VPInstruction::ComputeAnyOfResult ||
                cast<VPInstruction>(&U)->getOpcode() ==
                    VPInstruction::ComputeReductionResult ||
                cast<VPInstruction>(&U)->getOpcode() ==
                    VPInstruction::ComputeFindIVResult);
      });
      if (CM.usePredicatedReductionSelect())
        PhiR->setOperand(1, NewExitingVPV);
    }

    // We want code in the middle block to appear to execute on the location of
    // the scalar loop's latch terminator because: (a) it is all compiler
    // generated, (b) these instructions are always executed after evaluating
    // the latch conditional branch, and (c) other passes may add new
    // predecessors which terminate on this line. This is the easiest way to
    // ensure we don't accidentally cause an extra step back into the loop
    // while debugging.
    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();

    // TODO: At the moment ComputeReductionResult also drives creation of the
    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
    // even for in-loop reductions, until the reduction resume value handling is
    // also modeled in VPlan.
    VPInstruction *FinalReductionResult;
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(MiddleVPBB, IP);
    RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
    if (RecurrenceDescriptor::isFindIVRecurrenceKind(RecurrenceKind)) {
      VPValue *Start = PhiR->getStartValue();
      VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
      FinalReductionResult =
          Builder.createNaryOp(VPInstruction::ComputeFindIVResult,
                               {PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
    } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
      VPValue *Start = PhiR->getStartValue();
      FinalReductionResult =
          Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
                               {PhiR, Start, NewExitingVPV}, ExitDL);
    } else {
      VPIRFlags Flags =
          RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind)
              ? VPIRFlags(RdxDesc.getFastMathFlags())
              : VPIRFlags();
      FinalReductionResult =
          Builder.createNaryOp(VPInstruction::ComputeReductionResult,
                               {PhiR, NewExitingVPV}, Flags, ExitDL);
    }
    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
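    // E.g. (illustrative): an i32 add reduction whose values are known to fit
    // in i8 is truncated to <VF x i8>, reduced in i8, and the scalar result
    // is zero- or sign-extended back to i32.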
    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
      assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
             "Unexpected truncated min-max recurrence!");
      Type *RdxTy = RdxDesc.getRecurrenceType();
      auto *Trunc =
          new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
      Instruction::CastOps ExtendOpc =
          RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
      auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy);
      Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
      Extnd->insertAfter(Trunc);
      if (PhiR->getOperand(1) == NewExitingVPV)
        PhiR->setOperand(1, Extnd->getVPSingleValue());

      // Update ComputeReductionResult with the truncated exiting value and
      // extend its result.
      FinalReductionResult->setOperand(1, Trunc);
      FinalReductionResult =
          Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
    }

    // Update all users outside the vector region. Also replace redundant
    // ExtractLastElement.
    for (auto *U : to_vector(OrigExitingVPV->users())) {
      auto *Parent = cast<VPRecipeBase>(U)->getParent();
      if (FinalReductionResult == U || Parent->getParent())
        continue;
      U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
      if (match(U, m_VPInstruction<VPInstruction::ExtractLastElement>(
                       m_VPValue())))
        cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
    }

    // Adjust AnyOf reductions; replace the reduction phi for the selected value
    // with a boolean reduction phi node to check if the condition is true in
    // any iteration. The final value is selected by the final
    // ComputeReductionResult.
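    // E.g. (illustrative): for r = (a[i] > 42) ? 3 : r, the phi is rewritten
    // to or-reduce the i1 compare results; ComputeAnyOfResult then picks 3
    // or the start value depending on whether the compare was ever true.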
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
      auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
        return isa<VPWidenSelectRecipe>(U) ||
               (isa<VPReplicateRecipe>(U) &&
                cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
                    Instruction::Select);
      }));
      VPValue *Cmp = Select->getOperand(0);
      // If the compare is checking the reduction PHI node, adjust it to check
      // the start value.
      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
        CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
      Builder.setInsertPoint(Select);

      // If the true value of the select is the reduction phi, the new value is
      // selected if the negated condition is true in any iteration.
      if (Select->getOperand(1) == PhiR)
        Cmp = Builder.createNot(Cmp);
      VPValue *Or = Builder.createOr(PhiR, Cmp);
      Select->getVPSingleValue()->replaceAllUsesWith(Or);
      // Delete Select now that it has invalid types.
      ToDelete.push_back(Select);

      // Convert the reduction phi to operate on bools.
      PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
                               OrigLoop->getHeader()->getContext())));
      continue;
    }

    if (RecurrenceDescriptor::isFindIVRecurrenceKind(
            RdxDesc.getRecurrenceKind())) {
      // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
      // the sentinel value after generating the ResumePhi recipe, which uses
      // the original start value.
      PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
    }
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
         !RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
         !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
      VPBuilder PHBuilder(Plan->getVectorPreheader());
      VPValue *Iden = Plan->getOrAddLiveIn(
          getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
      // If the PHI is used by a partial reduction, set the scale factor.
      unsigned ScaleFactor =
          RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
              .value_or(1);
      Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
      auto *ScaleFactorVPV =
          Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
      VPValue *StartV = PHBuilder.createNaryOp(
          VPInstruction::ReductionStartVector,
          {PhiR->getStartValue(), Iden, ScaleFactorVPV},
          PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
                                     : FastMathFlags());
      PhiR->setOperand(0, StartV);
    }
  }
  for (VPRecipeBase *R : ToDelete)
    R->eraseFromParent();

  VPlanTransforms::runPass(VPlanTransforms::clearReductionWrapFlags, *Plan);
}

void LoopVectorizationPlanner::attachRuntimeChecks(
    VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
  const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
  if (SCEVCheckBlock) {
    assert((!CM.OptForSize ||
            CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
           "Cannot SCEV check stride or overflow when optimizing for size");
    VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
                                      HasBranchWeights);
  }
  const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
  if (MemCheckBlock) {
    // VPlan-native path does not do any analysis for runtime checks
    // currently.
    assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
           "Runtime checks are not supported for outer loops yet");

    if (CM.OptForSize) {
      assert(
          CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
          "Cannot emit memory checks when optimizing for size, unless forced "
          "to vectorize.");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                          OrigLoop->getStartLoc(),
                                          OrigLoop->getHeader())
               << "Code-size may be reduced by not forcing "
                  "vectorization, or by source-code modifications "
                  "eliminating the need for runtime checks "
                  "(e.g., adding 'restrict').";
      });
    }
    VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
                                      HasBranchWeights);
  }
}

void VPDerivedIVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "VPDerivedIVRecipe being replicated.");

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (FPBinOp)
    State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());

  Value *Step = State.get(getStepValue(), VPLane(0));
  Value *Index = State.get(getOperand(1), VPLane(0));
  Value *DerivedIV = emitTransformedIndex(
      State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
      cast_if_present<BinaryOperator>(FPBinOp));
  DerivedIV->setName(Name);
  // If index is the vector trip count, the concrete value will only be set in
  // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
  // TODO: Remove the special case for the vector trip count once it is
  // computed in VPlan and can be used during VPlan simplification.
  assert((DerivedIV != Index ||
          getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
         "IV didn't need transforming?");
  State.set(this, DerivedIV, VPLane(0));
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    };
  }

  // 3) If set, obey the hints
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  };

  // 4) if the TTI hook indicates this is profitable, request predication.
  TailFoldingInfo TFI(TLI, &LVL, IAI);
  if (TTI->preferPredicateOverEpilogue(&TFI))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI, PSI, BFI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getPlanFor(VF.Width);

  {
    GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                           VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
  }

  reportVectorization(ORE, L, VF, 1);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point, there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
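// E.g. (illustrative source; names are hypothetical):
//   void f(float *X, double D, int N) {
//     for (int I = 0; I < N; I++)
//       X[I] = (float)((double)X[I] + D);
//   }
// The fpext of X[I], reached when walking up from the float store below,
// forces double-width vector operations, halving the effective vector width.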
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

/// For loops with uncountable early exits, find the cost of doing work when
/// exiting the loop early, such as calculating the final exit values of
/// variables used outside the loop.
/// TODO: This is currently overly pessimistic because the loop may not take
/// the early exit, but better to keep this conservative for now. In future,
/// it might be possible to relax this by using branch probabilities.
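/// E.g. (illustrative): for
///   for (I = 0; I < N; I++) { if (A[I] == X) break; } use(I);
/// the vector.early.exit block must extract the lane of the first match to
/// produce the final value of I; that extra work is what gets costed here.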
static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
                                              VPlan &Plan, ElementCount VF) {
  InstructionCost Cost = 0;
  for (auto *ExitVPBB : Plan.getExitBlocks()) {
    for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
      // If the predecessor is not the middle.block, then it must be the
      // vector.early.exit block, which may contain work to calculate the exit
      // values of variables used outside the loop.
      if (PredVPBB != Plan.getMiddleBlock()) {
        LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
                          << PredVPBB->getName() << ":\n");
        Cost += PredVPBB->cost(VF, CostCtx);
      }
    }
  }
  return Cost;
}

/// This function determines whether or not it's still profitable to vectorize
/// the loop given the extra work we have to do outside of the loop:
/// 1. Perform the runtime checks before entering the loop to ensure it's safe
///    to vectorize.
/// 2. In the case of loops with uncountable early exits, we may have to do
///    extra work when exiting the loop early, such as calculating the final
///    exit values of variables used outside the loop.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
                                        VectorizationFactor &VF, Loop *L,
                                        PredicatedScalarEvolution &PSE,
                                        VPCostContext &CostCtx, VPlan &Plan,
                                        ScalarEpilogueLowering SEL,
                                        std::optional<unsigned> VScale) {
  InstructionCost TotalCost = Checks.getCost();
  if (!TotalCost.isValid())
    return false;

  // Add on the cost of any work required in the vector early exit block, if
  // one exists.
  TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);

  // When interleaving only, the scalar and vector costs will be equal, which
  // in turn would lead to a divide by 0. Fall back to hard threshold.
  if (VF.Width.isScalar()) {
    // TODO: Should we rename VectorizeMemoryCheckThreshold?
    if (TotalCost > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
  uint64_t ScalarC = VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  // The total cost of the scalar loop is
  //   ScalarC * TC
  // where
  // * TC is the actual trip count of the loop.
  // * ScalarC is the cost of a single scalar iteration.
  //
  // The total cost of the vector loop is
  //   RtC + VecC * (TC / VF) + EpiC
  // where
  // * RtC is the cost of the generated runtime checks plus the cost of
  //   performing any additional work in the vector.early.exit block for loops
  //   with uncountable early exits.
  // * VecC is the cost of a single vector iteration.
  // * TC is the actual trip count of the loop
  // * VF is the vectorization factor
  // * EpiC is the cost of the generated epilogue, including the cost
  //   of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the division below is rounded up, hence we get an upper estimate of the
  // minimum TC.
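  // For illustration (hypothetical costs, not from any real target): with
  // ScalarC = 4, VecC = 10, IntVF = 4 and RtC = 28, the bound computed below
  // is MinTC1 = ceil(28 * 4 / (4 * 4 - 10)) = ceil(112 / 6) = 19 iterations.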
  unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
  uint64_t RtC = TotalCost.getValue();
  uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
  uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
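  // E.g. (illustrative, with X = 10 as used below): for RtC = 28 and
  // ScalarC = 4, MinTC2 = ceil(28 * 10 / 4) = 70, i.e. a failed check costs
  // at most ~10% of the scalar loop once TC >= 70.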
  uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);

  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::max(MinTC1, MinTC2);
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(MinTC, IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
    if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");

      return false;
    }
  }
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
/// don't have a corresponding wide induction in \p EpiPlan.
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
  // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
  // will need their resume-values computed in the main vector loop. Others
  // can be removed from the main VPlan.
  SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
  for (VPRecipeBase &R :
       EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    if (isa<VPCanonicalIVPHIRecipe>(&R))
      continue;
    EpiWidenedPhis.insert(
        cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
  }
  for (VPRecipeBase &R :
       make_early_inc_range(MainPlan.getScalarHeader()->phis())) {
    auto *VPIRInst = cast<VPIRPhi>(&R);
    if (EpiWidenedPhis.contains(&VPIRInst->getIRPhi()))
      continue;
    // There is no corresponding wide induction in the epilogue plan that would
    // need a resume value. Remove the VPIRInst wrapping the scalar header phi
    // together with the corresponding ResumePhi. The resume values for the
    // scalar loop will be created during execution of EpiPlan.
    VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
    VPIRInst->eraseFromParent();
    ResumePhi->eraseFromParent();
  }
  VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan);

  using namespace VPlanPatternMatch;
  // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
  // introduce multiple uses of undef/poison. If the reduction start value may
  // be undef or poison it needs to be frozen and the frozen start has to be
  // used when computing the reduction result. We also need to use the frozen
  // value in the resume phi generated by the main vector loop, as this is also
  // used to compute the reduction result after the epilogue vector loop.
  auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
                                             bool UpdateResumePhis) {
    VPBuilder Builder(Plan.getEntry());
    for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
      auto *VPI = dyn_cast<VPInstruction>(&R);
      if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
        continue;
      VPValue *OrigStart = VPI->getOperand(1);
      if (isGuaranteedNotToBeUndefOrPoison(OrigStart->getLiveInIRValue()))
        continue;
      VPInstruction *Freeze =
          Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
      VPI->setOperand(1, Freeze);
      if (UpdateResumePhis)
        OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
          return Freeze != &U && isa<VPPhi>(&U);
        });
    }
  };
  AddFreezeForFindLastIVReductions(MainPlan, true);
  AddFreezeForFindLastIVReductions(EpiPlan, false);

  VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
  VPValue *VectorTC = &MainPlan.getVectorTripCount();
  // If there is a suitable resume value for the canonical induction in the
  // scalar (which will become vector) epilogue loop we are done. Otherwise
  // create it below.
  if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
        return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
                                                           m_SpecificInt(0)));
      }))
    return;
  VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
  ScalarPHBuilder.createScalarPhi(
      {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
      "vec.epilog.resume.val");
}

/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
static void
preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
                                 const SCEV2ValueTy &ExpandedSCEVs,
                                 const EpilogueLoopVectorizationInfo &EPI) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
  Header->setName("vec.epilog.vector.body");

  DenseMap<Value *, Value *> ToFrozen;
  // Ensure that the start values for all header phi recipes are updated before
  // vectorizing the epilogue loop.
  for (VPRecipeBase &R : Header->phis()) {
    if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
      // When vectorizing the epilogue loop, the canonical induction start
      // value needs to be changed from zero to the value after the main
      // vector loop. Find the resume value created during execution of the
      // main VPlan.
      // FIXME: Improve modeling for canonical IV start values in the epilogue
      // loop.
      using namespace llvm::PatternMatch;
      Type *IdxTy = IV->getScalarType();
      PHINode *EPResumeVal = find_singleton<PHINode>(
          L->getLoopPreheader()->phis(),
          [&EPI, IdxTy](PHINode &P, bool) -> PHINode * {
            if (P.getType() == IdxTy &&
                match(
                    P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
                    m_SpecificInt(0)) &&
                all_of(P.incoming_values(), [&EPI](Value *Inc) {
                  return Inc == EPI.VectorTripCount ||
                         match(Inc, m_SpecificInt(0));
                }))
              return &P;
            return nullptr;
          });
      assert(EPResumeVal && "must have a resume value for the canonical IV");
      VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
      assert(all_of(IV->users(),
                    [](const VPUser *U) {
                      return isa<VPScalarIVStepsRecipe>(U) ||
                             isa<VPDerivedIVRecipe>(U) ||
                             cast<VPRecipeBase>(U)->isScalarCast() ||
                             cast<VPInstruction>(U)->getOpcode() ==
                                 Instruction::Add;
                    }) &&
             "the canonical IV should only be used by its increment or "
             "ScalarIVSteps when resetting the start value");
      IV->setOperand(0, VPV);
      continue;
    }

    Value *ResumeV = nullptr;
    // TODO: Move setting of resume values to prepareToExecute.
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
      auto *RdxResult =
          cast<VPInstruction>(*find_if(ReductionPhi->users(), [](VPUser *U) {
            auto *VPI = dyn_cast<VPInstruction>(U);
            return VPI &&
                   (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
                    VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
                    VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
          }));
      ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
                    ->getIncomingValueForBlock(L->getLoopPreheader());
      RecurKind RK = ReductionPhi->getRecurrenceKind();
      if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
        Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
        // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
        // start value; compare the final value from the main vector loop
        // to the start value.
        BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
        IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
        ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
      } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(RK)) {
        Value *StartV = getStartValueFromReductionResult(RdxResult);
        ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
            EPI.MainLoopIterationCountCheck);

        // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
        // an adjustment to the resume value. The resume value is adjusted to
        // the sentinel value when the final value from the main vector loop
        // equals the start value. This ensures correctness when the start value
        // might not be less than the minimum value of a monotonically
        // increasing induction variable.
        BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
        IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
        Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
        Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
        ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
      } else {
        VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
        auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
        if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
          assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
                 "unexpected start value");
          VPI->setOperand(0, StartVal);
          continue;
        }
      }
    } else {
      // Retrieve the induction resume values for wide inductions from
      // their original phi nodes in the scalar loop.
      PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
      // Hook up to the PHINode generated by a ResumePhi recipe of main
      // loop VPlan, which feeds the scalar loop.
      ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
    }
    assert(ResumeV && "Must have a resume value");
    VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
    cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
  }

  // For some VPValues in the epilogue plan we must re-use the generated IR
  // values from the main plan. Replace them with live-in VPValues.
  // TODO: This is a workaround needed for epilogue vectorization and it
  // should be removed once induction resume value creation is done
  // directly in VPlan.
  for (auto &R : make_early_inc_range(*Plan.getEntry())) {
    // Re-use frozen values from the main plan for Freeze VPInstructions in the
    // epilogue plan. This ensures all users use the same frozen value.
    auto *VPI = dyn_cast<VPInstruction>(&R);
    if (VPI && VPI->getOpcode() == Instruction::Freeze) {
      VPI->replaceAllUsesWith(Plan.getOrAddLiveIn(
          ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
      continue;
    }

    // Re-use the trip count and steps expanded for the main loop, as
    // skeleton creation needs it as a value that dominates both the scalar
    // and vector epilogue loops.
    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpandR)
      continue;
    VPValue *ExpandedVal =
        Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
    ExpandR->replaceAllUsesWith(ExpandedVal);
    if (Plan.getTripCount() == ExpandR)
      Plan.resetTripCount(ExpandedVal);
    ExpandR->eraseFromParent();
  }
}

// Generate bypass values from the additional bypass block. Note that when the
// vectorized epilogue is skipped due to the iteration count check, the
// resume value for the induction variable comes from the trip count of the
// main vector loop, passed as the second argument.
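// E.g. (illustrative): for a secondary induction J = StartJ + I * StepJ, the
// bypass end value is StartJ + MainVectorTripCount * StepJ, computed via
// emitTransformedIndex below.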
static Value *createInductionAdditionalBypassValues(
    PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
    const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
    Instruction *OldInduction) {
  Value *Step = getExpandedStep(II, ExpandedSCEVs);
  // For the primary induction the additional bypass end value is known.
  // Otherwise it is computed.
  Value *EndValueFromAdditionalBypass = MainVectorTripCount;
  if (OrigPhi != OldInduction) {
    auto *BinOp = II.getInductionBinOp();
    // Fast-math-flags propagate from the original induction instruction.
    if (isa_and_nonnull<FPMathOperator>(BinOp))
      BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());

    // Compute the end value for the additional bypass.
    EndValueFromAdditionalBypass =
        emitTransformedIndex(BypassBuilder, MainVectorTripCount,
                             II.getStartValue(), Step, II.getKind(), BinOp);
    EndValueFromAdditionalBypass->setName("ind.end");
  }
  return EndValueFromAdditionalBypass;
}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
                    << L->getHeader()->getParent()->getName() << "' from "
                    << L->getLocStr() << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
    reportVectorizationFailure("Auto-vectorization of loops with uncountable "
                               "early exit is not enabled",
                               "UncountableEarlyExitLoopsDisabled", ORE, L);
    return false;
  }

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));

  if (LVL.hasUncountableEarlyExit()) {
    BasicBlock *LoopLatch = L->getLoopLatch();
    if (IAI.requiresScalarEpilogue() ||
        any_of(LVL.getCountableExitingBlocks(),
               [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
      reportVectorizationFailure("Auto-vectorization of early exit loops "
                                 "requiring a scalar epilogue is unsupported",
                                 "UncountableEarlyExitUnsupported", ORE, L);
      return false;
    }
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(PSE, L);
  if (ExpectedTC && ExpectedTC->isFixed() &&
      ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      // Predicate tail-folded loops are efficient even when the loop
      // iteration count is low. However, setting the epilogue policy to
      // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
      // with runtime checks. It's more effective to let
      // `isOutsideLoopWorkProfitable` determine if vectorization is
      // beneficial for the loop.
      if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
        SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats or vectors are
  // allowed.
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI, PSI, BFI);
  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();
  if (LVL.hasUncountableEarlyExit() && UserIC != 1) {
    UserIC = 1;
    reportVectorizationInfo("Interleaving not supported for loops "
                            "with uncountable early exits",
                            "InterleaveEarlyExitDisabled", ORE, L);
  }

  // Plan how to best vectorize.
  LVP.plan(UserVF, UserIC);
  VectorizationFactor VF = LVP.computeBestVF();
  unsigned IC = 1;

  if (ORE->allowExtraAnalysis(LV_NAME))
    LVP.emitInvalidCostRemarks(ORE);

  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
  if (LVP.hasPlanWithVF(VF.Width)) {
    // Select the interleave count.
    IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);

    unsigned SelectedIC = std::max(IC, UserIC);
    // Optimistically generate runtime checks if they are needed. Drop them if
    // they turn out to not be profitable.
    if (VF.Width.isVector() || SelectedIC > 1)
      Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);

    // Check if it is profitable to vectorize with runtime checks.
    bool ForceVectorization =
        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
    VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
                          CM, CM.CostKind);
    if (!ForceVectorization &&
        !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                     LVP.getPlanFor(VF.Width), SEL,
                                     CM.getVScaleForTuning())) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
                   L->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return false;
    }
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = {
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial"};
    VectorizeLoop = false;
  }

  if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = {"InterleavingAvoided",
                  "Ignoring UserIC, because interleaving was avoided up front"};
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = {
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial"};
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = {"InterleavingBeneficialButDisabled",
                  "the cost-model indicates that interleaving is beneficial "
                  "but is explicitly disabled or interleave count is set to 1"};
    InterleaveLoop = false;
  }

  // If there is a histogram in the loop, do not just interleave without
  // vectorizing. The order of operations will be incorrect without the
  // histogram intrinsics, which are only used for recipes with VF > 1.
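  // E.g. (illustrative): for Buckets[Indices[I]]++, interleaving alone would
  // reorder increments of potentially aliasing buckets, which is only safe
  // when expressed through the histogram intrinsics.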
  if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
    LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
                      << "to histogram operations.\n");
    IntDiagMsg = {
        "HistogramPreventsScalarInterleaving",
        "Unable to interleave without vectorization due to constraints on "
        "the order of histogram operations"};
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10191 ORE->emit([&]() {
10192 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10193 L->getStartLoc(), L->getHeader())
10194 << VecDiagMsg.second;
10195 });
10196 ORE->emit([&]() {
10197 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10198 L->getStartLoc(), L->getHeader())
10199 << IntDiagMsg.second;
10200 });
10201 return false;
10202 }
10203
10204 if (!VectorizeLoop && InterleaveLoop) {
10205 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10206 ORE->emit([&]() {
10207 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10208 L->getStartLoc(), L->getHeader())
10209 << VecDiagMsg.second;
10210 });
10211 } else if (VectorizeLoop && !InterleaveLoop) {
10212 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10213 << ") in " << L->getLocStr() << '\n');
10214 ORE->emit([&]() {
10215 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10216 L->getStartLoc(), L->getHeader())
10217 << IntDiagMsg.second;
10218 });
10219 } else if (VectorizeLoop && InterleaveLoop) {
10220 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10221 << ") in " << L->getLocStr() << '\n');
10222 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10223 }
10224
10225 bool DisableRuntimeUnroll = false;
10226 MDNode *OrigLoopID = L->getLoopID();
10227 {
10228 using namespace ore;
10229 if (!VectorizeLoop) {
10230 assert(IC > 1 && "interleave count should not be 1 or 0");
10231 // If we decided that it is not legal to vectorize the loop, then
10232 // interleave it.
10233 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10234 InnerLoopVectorizer Unroller(
10235 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10236 ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
10237
10238 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10239
10240 ORE->emit([&]() {
10241 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10242 L->getHeader())
10243 << "interleaved loop (interleaved count: "
10244 << NV("InterleaveCount", IC) << ")";
10245 });
10246 } else {
10247 // If we decided that it is *legal* to vectorize the loop, then do it.
10248
10249 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10250 // Consider vectorizing the epilogue too if it's profitable.
10251 VectorizationFactor EpilogueVF =
10252 LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10253 if (EpilogueVF.Width.isVector()) {
10254 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10255
10256 // The first pass vectorizes the main loop and creates a scalar epilogue
10257 // to be vectorized by executing the plan (potentially with a different
10258 // factor) again shortly afterwards.
10259 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10260 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
10261 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10262 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10263 BestEpiPlan);
10264 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10265 EPI, &CM, BFI, PSI, Checks,
10266 *BestMainPlan);
10267 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10268 *BestMainPlan, MainILV, DT, false);
10269 ++LoopsVectorized;
10270
10271 // Second pass vectorizes the epilogue and adjusts the control flow
10272 // edges from the first pass.
10273 EPI.MainLoopVF = EPI.EpilogueVF;
10274 EPI.MainLoopUF = EPI.EpilogueUF;
10275 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10276 ORE, EPI, &CM, BFI, PSI,
10277 Checks, BestEpiPlan);
10278 EpilogILV.setTripCount(MainILV.getTripCount());
10279 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10280
10281 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10282 DT, true);
10283
        // Fix induction resume values from the additional bypass block.
        BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
        IRBuilder<> BypassBuilder(BypassBlock,
                                  BypassBlock->getFirstInsertionPt());
        BasicBlock *PH = L->getLoopPreheader();
        for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
          auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
          Value *V = createInductionAdditionalBypassValues(
              IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount,
              LVL.getPrimaryInduction());
          // TODO: Directly add as extra operand to the VPResumePHI recipe.
          Inc->setIncomingValueForBlock(BypassBlock, V);
        }
        ++LoopsEpilogueVectorized;

        if (!Checks.hasChecks())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                               VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
                               Checks, BestPlan);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that
        // is rarely used is not worth unrolling.
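        // The hint added is the standard runtime-unroll disable metadata,
        // e.g. (illustrative IR):
        //   !llvm.loop !0
        //   !0 = distinct !{!0, !1}
        //   !1 = !{!"llvm.loop.unroll.runtime.disable"}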
        if (!Checks.hasChecks())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      reportVectorization(ORE, L, VF, IC);
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
         "DT not preserved correctly");

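  // If the original loop carried followup metadata (e.g. attached via
  // "#pragma clang loop" transformation hints), propagate it to the remaining
  // scalar loop. Illustrative shape of such metadata:
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.vectorize.followup_epilogue", !2}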
  std::optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID) {
    L->setLoopID(*RemainderLoopID);
  } else {
    if (DisableRuntimeUnroll)
      addRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
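  // Illustrative example (not from this pass): on a scalar-only target whose
  // max interleave factor is 2, a reduction such as
  //   for (i = 0; i < n; ++i) s += a[i];
  // can still be interleaved into two independent partial sums, breaking the
  // loop-carried dependence chain and improving ILP.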
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
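  // For reference (see LoopSimplify for the authoritative definition),
  // simplified form means each loop has a preheader, a single backedge, and
  // dedicated exit blocks.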
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed) {
      LAIs->clear();

#ifndef NDEBUG
      if (VerifySCEV)
        SE->verify();
#endif
    }
  }

  // All eligible loop nests in the function have been processed; report
  // whether anything changed and whether the CFG was modified.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  LI = &AM.getResult<LoopAnalysis>(F);
  // If there are no loops in the function, return early before computing
  // other expensive analyses.
  if (LI->empty())
    return PreservedAnalyses::all();
  SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  TTI = &AM.getResult<TargetIRAnalysis>(F);
  DT = &AM.getResult<DominatorTreeAnalysis>(F);
  TLI = &AM.getResult<TargetLibraryAnalysis>(F);
  AC = &AM.getResult<AssumptionAnalysis>(F);
  DB = &AM.getResult<DemandedBitsAnalysis>(F);
  ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  LAIs = &AM.getResult<LoopAccessAnalysis>(F);

  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result = runImpl(F);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<ScalarEvolutionAnalysis>();
  PA.preserve<LoopAccessAnalysis>();

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

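  // With default options this renders as, e.g. (assuming the mixin above
  // printed the usual "loop-vectorize" pass name):
  //   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>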
  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}