//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
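//
// For example, with a vector width of four, a loop such as
//   for (i = 0; i < n; ++i)
//     C[i] = A[i] + B[i];
// is conceptually rewritten (an illustration only; the transform operates on
// LLVM-IR, not source) so that each wide iteration loads four elements of A
// and B, adds them with one vector add, stores four results, and steps i by
// four; leftover iterations run in a scalar remainder loop.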
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}
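//
// A sketch of how the follow-up metadata is used (the attribute operands here
// are placeholders, not taken from this file): the loop's !llvm.loop node may
// contain a property such as
//   !{!"llvm.loop.vectorize.followup_vectorized", <attributes>}
// and the listed attributes then become the loop metadata of the generated
// vectorized loop.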

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
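// For example, to prefer tail-folding but fall back to a scalar epilogue when
// tail-folding fails, one can run (assuming the standalone opt tool):
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...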

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
                   "data-and-control-without-rt-check",
                   "Similar to data-and-control, but remove the runtime check"),
        clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
                   "Use predicated EVL instructions for tail folding. If EVL "
                   "is unsupported, fall back to data-without-lane-mask.")));
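// As a concrete illustration: with VF=4 and a trip count of 10, tail folding
// runs three masked vector iterations (the lane mask disables the two lanes
// past the trip count in the final iteration) instead of two full vector
// iterations followed by a two-iteration scalar epilogue.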

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

cl::opt<unsigned> llvm::ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> llvm::EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

cl::opt<bool>
    llvm::VerifyEachVPlan("vplan-verify-each",
#ifdef EXPENSIVE_CHECKS
                          cl::init(true),
#else
                          cl::init(false),
#endif
                          cl::Hidden,
                          cl::desc("Verify VPlans after VPlan transforms."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

static cl::opt<bool> EnableEarlyExitVectorization(
    "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable vectorization of early exit loops with uncountable exits."));

// Likelihood of bypassing the vectorized loop because there are zero trips
// left after the prologue. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
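  // For example, x86_fp80 has a type size of 80 bits but an alloc size of 96
  // or 128 bits depending on the target's alignment requirements, so arrays
  // of it contain padding and the type counts as irregular here.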
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
/// ElementCount to include loops whose trip count is a function of vscale.
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
                                              const Loop *L) {
  return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
}

/// Returns "best known" trip count, which is either a valid positive trip count
/// or std::nullopt when an estimate cannot be made (including when the trip
/// count would overflow), for the specified loop \p L as defined by the
/// following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<ElementCount>
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
                    bool CanUseConstantMax = true) {
  // Check if exact trip count is known.
  if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return ElementCount::getFixed(*EstimatedTC);

  if (!CanUseConstantMax)
    return std::nullopt;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
    return ElementCount::getFixed(ExpectedTC);

  return std::nullopt;
}


namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationCostModel *CM,
                      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
                      GeneratedRTChecks &RTChecks, VPlan &Plan)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth),
        MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI),
        RTChecks(RTChecks), Plan(Plan),
        VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop. In the case of epilogue vectorization, this function is overridden
  /// to handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();
  /// Fix the vectorized code, taking care of header phis, and more.
  void fixVectorizedLoop(VPTransformState &State);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

  /// Return the additional bypass block which targets the scalar loop by
  /// skipping the epilogue loop after completing the main loop.
  BasicBlock *getAdditionalBypassBlock() const {
    assert(AdditionalBypassBlock &&
           "Trying to access AdditionalBypassBlock but it has not been set");
    return AdditionalBypassBlock;
  }

protected:
  friend class LoopVectorizationPlanner;

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Create a check to see if the vector loop should be executed.
  Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
  /// vector preheader and its predecessor, also connecting the new block to the
  /// scalar preheader.
  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader = nullptr;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader = nullptr;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock = nullptr;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  /// The additional bypass block which conditionally skips over the epilogue
  /// loop after executing the main loop. Needed to resume inductions and
  /// reductions during epilogue vectorization.
  BasicBlock *AdditionalBypassBlock = nullptr;

  VPlan &Plan;

  /// The vector preheader block of \p Plan, used as target for check blocks
  /// introduced during skeleton creation.
  VPBlockBase *VectorPHVPB;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;
  VPlan &EpiloguePlan;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF,
                                VPlan &EpiloguePlan)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
        EpiloguePlan(EpiloguePlan) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
      ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM,
                            BFI, PSI, Checks, Plan),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
      ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, CM, BFI, PSI, Check, Plan) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
      ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, CM, BFI, PSI, Checks, Plan) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its operands.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return DebugLoc::getUnknown();

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I->getDebugLoc();

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst->getDebugLoc();
  }

  return I->getDebugLoc();
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
/// RemarkName is the identifier for the remark.  If \p I is passed it is an
/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
/// the location of the remark. If \p DL is passed, use it as debug location for
/// the remark. \return the remark object that can be streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I, DebugLoc DL = {}) {
  BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
  // If debug location is attached to the instruction, use it. Otherwise if DL
  // was not provided, use the loop's.
  if (I && I->getDebugLoc())
    DL = I->getDebugLoc();
  else if (!DL)
    DL = TheLoop->getStartLoc();

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
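/// For example, for VF = <vscale x 4> and Step = 2 this returns a runtime
/// value equal to vscale * 8, while for a fixed VF of 4 it is the constant 8.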
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
                             I, DL)
            << Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}

} // end namespace llvm

namespace llvm {

// Hints for the loop vectorization cost model on how the scalar epilogue loop
// should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
  friend class LoopVectorizationPlanner;

public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI,
                             ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {
    if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
      initializeVScaleForTuning();
    CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                             PGSOQueryType::IRPass);
  }

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectNonVectorizedAndSetWideningDecisions(UserVF);
    return expectedCost(UserVF).isValid();
  }

  /// \return True if maximizing vector bandwidth is enabled by the target or
  /// user options, for the given register kind.
  bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);

  /// \return True if maximizing vector bandwidth is enabled by the target or
  /// user options, for the given vector factor.
  bool useMaxBandwidth(ElementCount VF);

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                 InstructionCost LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
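  /// For example, an i32 value whose demanded bits fit in 8 bits may be
  /// computed in <VF x i8> vectors and extended only where a wider use
  /// requires it (a sketch of the idea; the widths themselves are computed
  /// via demanded-bits analysis).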
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
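  /// For example, the address computation feeding a consecutive (unit-stride)
  /// load is typically needed only for lane zero, with the remaining lanes
  /// implied by it.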
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // A pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that the profiled loop trip count can be accurately
    // accumulated instead of being undercounted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.contains(I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize,
    CM_VectorCall,
    CM_IntrinsicCall
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[{I, VF}] = {W, Cost};
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// When interleaving, the cost will only be assigned to one instruction,
    /// the insert position. For other cases, add the appropriate fraction of
    /// the total cost to each instruction. This ensures accurate costs are
    /// used, even if the insert position instruction is not used.
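    /// For example, if a group with 4 members is widened with CM_Scalarize at
    /// a total cost of 8, each member (the insert position included) is
    /// assigned a cost of 2.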
    InstructionCost InsertPosCost = Cost;
    InstructionCost OtherMemberCost = 0;
    if (W != CM_Interleave)
      OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
    for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
      if (auto *I = Grp->getMember(Idx)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[{I, VF}] = {W, InsertPosCost};
        else
          WideningDecisions[{I, VF}] = {W, OtherMemberCost};
      }
    }
  }

1110   /// Return the cost model decision for the given instruction \p I and vector
1111   /// width \p VF. Return CM_Unknown if this instruction did not pass
1112   /// through the cost modeling.
getWideningDecision(Instruction * I,ElementCount VF) const1113   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1114     assert(VF.isVector() && "Expected VF to be a vector VF");
1115     assert(
1116         TheLoop->isInnermost() &&
1117         "cost-model should not be used for outer loops (in VPlan-native path)");
1118 
1119     std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1120     auto Itr = WideningDecisions.find(InstOnVF);
1121     if (Itr == WideningDecisions.end())
1122       return CM_Unknown;
1123     return Itr->second.first;
1124   }
1125 
1126   /// Return the vectorization cost for the given instruction \p I and vector
1127   /// width \p VF.
1128   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1129     assert(VF.isVector() && "Expected VF >=2");
1130     std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1131     assert(WideningDecisions.contains(InstOnVF) &&
1132            "The cost is not calculated");
1133     return WideningDecisions[InstOnVF].second;
1134   }
1135 
1136   struct CallWideningDecision {
1137     InstWidening Kind;
1138     Function *Variant;
1139     Intrinsic::ID IID;
1140     std::optional<unsigned> MaskPos;
1141     InstructionCost Cost;
1142   };
1143 
1144   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1145                                Function *Variant, Intrinsic::ID IID,
1146                                std::optional<unsigned> MaskPos,
1147                                InstructionCost Cost) {
1148     assert(!VF.isScalar() && "Expected vector VF");
1149     CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
1150   }
1151 
1152   CallWideningDecision getCallWideningDecision(CallInst *CI,
1153                                                ElementCount VF) const {
1154     assert(!VF.isScalar() && "Expected vector VF");
1155     return CallWideningDecisions.at({CI, VF});
1156   }
1157 
1158   /// Return True if instruction \p I is an optimizable truncate whose operand
1159   /// is an induction variable. Such a truncate will be removed by adding a new
1160   /// induction variable with the destination type.
1161   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1162     // If the instruction is not a truncate, return false.
1163     auto *Trunc = dyn_cast<TruncInst>(I);
1164     if (!Trunc)
1165       return false;
1166 
1167     // Get the source and destination types of the truncate.
1168     Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
1169     Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
1170 
1171     // If the truncate is free for the given types, return false. Replacing a
1172     // free truncate with an induction variable would add an induction variable
1173     // update instruction to each iteration of the loop. We exclude from this
1174     // check the primary induction variable since it will need an update
1175     // instruction regardless.
1176     Value *Op = Trunc->getOperand(0);
1177     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1178       return false;
1179 
1180     // If the truncated value is not an induction variable, return false.
1181     return Legal->isInductionPhi(Op);
1182   }
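       // Hypothetical IR sketch of an optimizable IV truncate (value names
       // invented for illustration):
       //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
       //   %t  = trunc i64 %iv to i32
       // If %iv is an induction PHI and the truncate is not free (or %iv is
       // the primary induction), the trunc is removed by creating a new i32
       // induction variable instead.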
1183 
1184   /// Collects the instructions to scalarize for each predicated instruction in
1185   /// the loop.
1186   void collectInstsToScalarize(ElementCount VF);
1187 
1188   /// Collect values that will not be widened, including Uniforms, Scalars, and
1189   /// Instructions to Scalarize for the given \p VF.
1190   /// The sets depend on CM decision for Load/Store instructions
1191   /// that may be vectorized as interleave, gather-scatter or scalarized.
1192   /// Also make a decision on what to do about call instructions in the loop
1193   /// at that VF -- scalarize, call a known vector routine, or call a
1194   /// vector intrinsic.
1195   void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
1196     // Do the analysis once.
1197     if (VF.isScalar() || Uniforms.contains(VF))
1198       return;
1199     setCostBasedWideningDecision(VF);
1200     collectLoopUniforms(VF);
1201     setVectorizedCallDecision(VF);
1202     collectLoopScalars(VF);
1203     collectInstsToScalarize(VF);
1204   }
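       // Usage sketch (hypothetical caller, mirroring how the planner would
       // drive this): invoke once per candidate VF before costing, e.g.
       //   for (ElementCount VF : CandidateVFs)
       //     CM.collectNonVectorizedAndSetWideningDecisions(VF);
       // Repeated calls for the same VF return early via the Uniforms check.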
1205 
1206   /// Returns true if the target machine supports masked store operation
1207   /// for the given \p DataType and kind of access to \p Ptr.
1208   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1209                           unsigned AddressSpace) const {
1210     return Legal->isConsecutivePtr(DataType, Ptr) &&
1211            TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
1212   }
1213 
1214   /// Returns true if the target machine supports masked load operation
1215   /// for the given \p DataType and kind of access to \p Ptr.
1216   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1217                          unsigned AddressSpace) const {
1218     return Legal->isConsecutivePtr(DataType, Ptr) &&
1219            TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1220   }
1221 
1222   /// Returns true if the target machine can represent \p V as a masked gather
1223   /// or scatter operation.
1224   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1225     bool LI = isa<LoadInst>(V);
1226     bool SI = isa<StoreInst>(V);
1227     if (!LI && !SI)
1228       return false;
1229     auto *Ty = getLoadStoreType(V);
1230     Align Align = getLoadStoreAlignment(V);
1231     if (VF.isVector())
1232       Ty = VectorType::get(Ty, VF);
1233     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1234            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1235   }
1236 
1237   /// Returns true if the target machine supports all of the reduction
1238   /// variables found for the given VF.
1239   bool canVectorizeReductions(ElementCount VF) const {
1240     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1241       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1242       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1243     }));
1244   }
1245 
1246   /// Given costs for both strategies, return true if the scalar predication
1247   /// lowering should be used for div/rem.  This incorporates an override
1248   /// option so it is not simply a cost comparison.
1249   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1250                                      InstructionCost SafeDivisorCost) const {
1251     switch (ForceSafeDivisor) {
1252     case cl::BOU_UNSET:
1253       return ScalarCost < SafeDivisorCost;
1254     case cl::BOU_TRUE:
1255       return false;
1256     case cl::BOU_FALSE:
1257       return true;
1258     }
1259     llvm_unreachable("impossible case value");
1260   }
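       // The override in tabular form (a restatement of the switch above):
       //   ForceSafeDivisor unset -> scalarize iff ScalarCost < SafeDivisorCost
       //   ForceSafeDivisor=true  -> never scalarize (always use safe divisor)
       //   ForceSafeDivisor=false -> always scalarize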
1261 
1262   /// Returns true if \p I is an instruction which requires predication and
1263   /// for which our chosen predication strategy is scalarization (i.e. we
1264   /// don't have an alternate strategy such as masking available).
1265   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1266   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1267 
1268   /// Returns true if \p I is an instruction that needs to be predicated
1269   /// at runtime.  The result is independent of the predication mechanism.
1270   /// Superset of instructions that return true for isScalarWithPredication.
1271   bool isPredicatedInst(Instruction *I) const;
1272 
1273   /// Return the costs for our two available strategies for lowering a
1274   /// div/rem operation which requires speculating at least one lane.
1275   /// First result is for scalarization (will be invalid for scalable
1276   /// vectors); second is for the safe-divisor strategy.
1277   std::pair<InstructionCost, InstructionCost>
1278   getDivRemSpeculationCost(Instruction *I,
1279                            ElementCount VF) const;
1280 
1281   /// Returns true if \p I is a memory instruction with consecutive memory
1282   /// access that can be widened.
1283   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1284 
1285   /// Returns true if \p I is a memory instruction in an interleaved-group
1286   /// of memory accesses that can be vectorized with wide vector loads/stores
1287   /// and shuffles.
1288   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1289 
1290   /// Check if \p Instr belongs to any interleaved access group.
1291   bool isAccessInterleaved(Instruction *Instr) const {
1292     return InterleaveInfo.isInterleaved(Instr);
1293   }
1294 
1295   /// Get the interleaved access group that \p Instr belongs to.
1296   const InterleaveGroup<Instruction> *
1297   getInterleavedAccessGroup(Instruction *Instr) const {
1298     return InterleaveInfo.getInterleaveGroup(Instr);
1299   }
1300 
1301   /// Returns true if we're required to use a scalar epilogue for at least
1302   /// the final iteration of the original loop.
1303   bool requiresScalarEpilogue(bool IsVectorizing) const {
1304     if (!isScalarEpilogueAllowed()) {
1305       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1306       return false;
1307     }
1308     // If we might exit from anywhere but the latch and early exit vectorization
1309     // is disabled, we must run the exiting iteration in scalar form.
1310     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1311         !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1312       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1313                            "from latch block\n");
1314       return true;
1315     }
1316     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1317       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1318                            "interleaved group requires scalar epilogue\n");
1319       return true;
1320     }
1321     LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1322     return false;
1323   }
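       // Example (illustrative): an interleave group accessing a[2*i] with a
       // gap at a[2*i+1] may touch memory past the original loop's footprint
       // in its last vector iteration; InterleaveInfo.requiresScalarEpilogue()
       // then forces the final iteration(s) into the scalar loop.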
1324 
1325   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1326   /// loop hint annotation.
1327   bool isScalarEpilogueAllowed() const {
1328     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1329   }
1330 
1331   /// Returns the TailFoldingStyle that is best for the current loop.
1332   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1333     if (!ChosenTailFoldingStyle)
1334       return TailFoldingStyle::None;
1335     return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1336                                : ChosenTailFoldingStyle->second;
1337   }
1338 
1339   /// Selects and saves the TailFoldingStyle for both cases: whether the IV
1340   /// update may overflow or not.
1341   /// \param IsScalableVF true if scalable vector factors enabled.
1342   /// \param UserIC User specific interleave count.
1343   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1344     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1345     if (!Legal->canFoldTailByMasking()) {
1346       ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1347       return;
1348     }
1349 
1350     // Default to TTI preference, but allow command line override.
1351     ChosenTailFoldingStyle = {
1352         TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1353         TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
1354     if (ForceTailFoldingStyle.getNumOccurrences())
1355       ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
1356                                 ForceTailFoldingStyle.getValue()};
1357 
1358     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1359       return;
1360     // Override forced styles if needed.
1361     // FIXME: Investigate opportunity for fixed vector factor.
1362     bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1363                       TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1364     if (EVLIsLegal)
1365       return;
1366     // If for some reason EVL mode is unsupported, fallback to
1367     // DataWithoutLaneMask to try to vectorize the loop with folded tail
1368     // in a generic way.
1369     ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
1370                               TailFoldingStyle::DataWithoutLaneMask};
1371     LLVM_DEBUG(
1372         dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1373                   "not try to generate VP Intrinsics "
1374                << (UserIC > 1
1375                        ? "since interleave count specified is greater than 1.\n"
1376                        : "due to non-interleaving reasons.\n"));
1377   }
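       // Summary of the EVL legality check above (a sketch mirroring the
       // code): a forced DataWithEVL style is kept only when UserIC <= 1,
       // scalable VFs are enabled, TTI.hasActiveVectorLength() holds and the
       // VPlan-native path is off; otherwise it degrades to
       // DataWithoutLaneMask for both overflow cases.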
1378 
1379   /// Returns true if all loop blocks should be masked to fold tail loop.
1380   bool foldTailByMasking() const {
1381     // TODO: check if it is possible to check for None style independent of
1382     // IVUpdateMayOverflow flag in getTailFoldingStyle.
1383     return getTailFoldingStyle() != TailFoldingStyle::None;
1384   }
1385 
1386   /// Return the maximum safe number of elements to be processed per vector
1387   /// iteration without breaking store-load forwarding or violating memory
1388   /// dependencies. Required for EVL-based VPlans to
1389   /// correctly calculate AVL (application vector length) as min(remaining AVL,
1390   /// MaxSafeElements).
1391   /// TODO: need to consider adjusting cost model to use this value as a
1392   /// vectorization factor for EVL-based vectorization.
1393   std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1394 
1395   /// Returns true if the instructions in this block require predication
1396   /// for any reason, e.g. because tail folding now requires a predicate
1397   /// or because the block in the original loop was predicated.
1398   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1399     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1400   }
1401 
1402   /// Returns true if VP intrinsics with explicit vector length support should
1403   /// be generated in the tail folded loop.
1404   bool foldTailWithEVL() const {
1405     return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1406   }
1407 
1408   /// Returns true if the Phi is part of an inloop reduction.
1409   bool isInLoopReduction(PHINode *Phi) const {
1410     return InLoopReductions.contains(Phi);
1411   }
1412 
1413   /// Returns true if the predicated reduction select should be used to set the
1414   /// incoming value for the reduction phi.
1415   bool usePredicatedReductionSelect() const {
1416     // Force to use predicated reduction select since the EVL of the
1417     // second-to-last iteration might not be VF*UF.
1418     if (foldTailWithEVL())
1419       return true;
1420     return PreferPredicatedReductionSelect ||
1421            TTI.preferPredicatedReductionSelect();
1422   }
1423 
1424   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1425   /// with factor VF.  Return the cost of the instruction, including
1426   /// scalarization overhead if it's needed.
1427   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1428 
1429   /// Estimate cost of a call instruction CI if it were vectorized with factor
1430   /// VF. Return the cost of the instruction, including scalarization overhead
1431   /// if it's needed.
1432   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1433 
1434   /// Invalidates decisions already taken by the cost model.
1435   void invalidateCostModelingDecisions() {
1436     WideningDecisions.clear();
1437     CallWideningDecisions.clear();
1438     Uniforms.clear();
1439     Scalars.clear();
1440   }
1441 
1442   /// Returns the expected execution cost. The unit of the cost does
1443   /// not matter because we use the 'cost' units to compare different
1444   /// vector widths. The cost that is returned is *not* normalized by
1445   /// the factor width.
1446   InstructionCost expectedCost(ElementCount VF);
1447 
1448   bool hasPredStores() const { return NumPredStores > 0; }
1449 
1450   /// Returns true if epilogue vectorization is considered profitable, and
1451   /// false otherwise.
1452   /// \p VF is the vectorization factor chosen for the original loop.
1453   /// \p Multiplier is an aditional scaling factor applied to VF before
1454   /// comparing to EpilogueVectorizationMinVF.
1455   bool isEpilogueVectorizationProfitable(const ElementCount VF,
1456                                          const unsigned IC) const;
1457 
1458   /// Returns the execution time cost of an instruction for a given vector
1459   /// width. Vector width of one means scalar.
1460   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1461 
1462   /// Return the cost of instructions in an inloop reduction pattern, if I is
1463   /// part of that pattern.
1464   std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1465                                                          ElementCount VF,
1466                                                          Type *VectorTy) const;
1467 
1468   /// Returns true if \p Op should be considered invariant and if it is
1469   /// trivially hoistable.
1470   bool shouldConsiderInvariant(Value *Op);
1471 
1472   /// Return the value of vscale used for tuning the cost model.
1473   std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1474 
1475 private:
1476   unsigned NumPredStores = 0;
1477 
1478   /// Used to store the value of vscale used for tuning the cost model. It is
1479   /// initialized during object construction.
1480   std::optional<unsigned> VScaleForTuning;
1481 
1482   /// Initializes the value of vscale used for tuning the cost model. If
1483   /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1484   /// return the value returned by the corresponding TTI method.
1485   void initializeVScaleForTuning() {
1486     const Function *Fn = TheLoop->getHeader()->getParent();
1487     if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1488       auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1489       auto Min = Attr.getVScaleRangeMin();
1490       auto Max = Attr.getVScaleRangeMax();
1491       if (Max && Min == Max) {
1492         VScaleForTuning = Max;
1493         return;
1494       }
1495     }
1496 
1497     VScaleForTuning = TTI.getVScaleForTuning();
1498   }
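       // Example (hypothetical attribute values): a function annotated with
       // vscale_range(2,2) yields VScaleForTuning = 2, while vscale_range(1,16)
       // is not exact, so TTI.getVScaleForTuning() supplies the value instead.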
1499 
1500   /// \return An upper bound for the vectorization factors for both
1501   /// fixed and scalable vectorization, where the minimum-known number of
1502   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1503   /// disabled or unsupported, then the scalable part will be equal to
1504   /// ElementCount::getScalable(0).
1505   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1506                                            ElementCount UserVF,
1507                                            bool FoldTailByMasking);
1508 
1509   /// \return the maximized element count based on the target's vector
1510   /// registers and the loop trip-count, but limited to a maximum safe VF.
1511   /// This is a helper function of computeFeasibleMaxVF.
1512   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1513                                        unsigned SmallestType,
1514                                        unsigned WidestType,
1515                                        ElementCount MaxSafeVF,
1516                                        bool FoldTailByMasking);
1517 
1518   /// Checks if scalable vectorization is supported and enabled. Caches the
1519   /// result to avoid repeated debug dumps for repeated queries.
1520   bool isScalableVectorizationAllowed();
1521 
1522   /// \return the maximum legal scalable VF, based on the safe max number
1523   /// of elements.
1524   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1525 
1526   /// Calculate vectorization cost of memory instruction \p I.
1527   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1528 
1529   /// The cost computation for scalarized memory instruction.
1530   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1531 
1532   /// The cost computation for interleaving group of memory instructions.
1533   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1534 
1535   /// The cost computation for Gather/Scatter instruction.
1536   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1537 
1538   /// The cost computation for widening instruction \p I with consecutive
1539   /// memory access.
1540   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1541 
1542   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1543   /// Load: scalar load + broadcast.
1544   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1545   /// element)
1546   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1547 
1548   /// Estimate the overhead of scalarizing an instruction. This is a
1549   /// convenience wrapper for the type-based getScalarizationOverhead API.
1550   InstructionCost getScalarizationOverhead(Instruction *I,
1551                                            ElementCount VF) const;
1552 
1553   /// Returns true if an artificially high cost for emulated masked memrefs
1554   /// should be used.
1555   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1556 
1557   /// Map of scalar integer values to the smallest bitwidth they can be legally
1558   /// represented as. The vector equivalents of these values should be truncated
1559   /// to this type.
1560   MapVector<Instruction *, uint64_t> MinBWs;
1561 
1562   /// A type representing the costs for instructions if they were to be
1563   /// scalarized rather than vectorized. The entries are Instruction-Cost
1564   /// pairs.
1565   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1566 
1567   /// A set containing all BasicBlocks that are known to be present after
1568   /// vectorization as predicated blocks.
1569   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1570       PredicatedBBsAfterVectorization;
1571 
1572   /// Records whether it is allowed to have the original scalar loop execute at
1573   /// least once. This may be needed as a fallback loop in case runtime
1574   /// aliasing/dependence checks fail, or to handle the tail/remainder
1575   /// iterations when the trip count is unknown or doesn't divide by the VF,
1576   /// or as a peel-loop to handle gaps in interleave-groups.
1577   /// Under optsize and when the trip count is very small we don't allow any
1578   /// iterations to execute in the scalar loop.
1579   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1580 
1581   /// Controls the finally chosen tail folding style. The first element is
1582   /// used if the IV update may overflow; the second if it does not.
1583   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1584       ChosenTailFoldingStyle;
1585 
1586   /// true if scalable vectorization is supported and enabled.
1587   std::optional<bool> IsScalableVectorizationAllowed;
1588 
1589   /// Maximum safe number of elements to be processed per vector iteration
1590   /// without breaking store-load forwarding or violating memory
1591   /// dependencies. Required for EVL-based vectorization, where this
1592   /// value is used as the upper bound of the safe AVL.
1593   std::optional<unsigned> MaxSafeElements;
1594 
1595   /// A map holding scalar costs for different vectorization factors. The
1596   /// presence of a cost for an instruction in the mapping indicates that the
1597   /// instruction will be scalarized when vectorizing with the associated
1598   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1599   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1600 
1601   /// Holds the instructions known to be uniform after vectorization.
1602   /// The data is collected per VF.
1603   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1604 
1605   /// Holds the instructions known to be scalar after vectorization.
1606   /// The data is collected per VF.
1607   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1608 
1609   /// Holds the instructions (address computations) that are forced to be
1610   /// scalarized.
1611   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1612 
1613   /// PHINodes of the reductions that should be expanded in-loop.
1614   SmallPtrSet<PHINode *, 4> InLoopReductions;
1615 
1616   /// A Map of inloop reduction operations and their immediate chain operand.
1617   /// FIXME: This can be removed once reductions can be costed correctly in
1618   /// VPlan. This was added to allow quick lookup of the inloop operations.
1619   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1620 
1621   /// Returns the expected difference in cost from scalarizing the expression
1622   /// feeding a predicated instruction \p PredInst. The instructions to
1623   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1624   /// non-negative return value implies the expression will be scalarized.
1625   /// Currently, only single-use chains are considered for scalarization.
1626   InstructionCost computePredInstDiscount(Instruction *PredInst,
1627                                           ScalarCostsTy &ScalarCosts,
1628                                           ElementCount VF);
1629 
1630   /// Collect the instructions that are uniform after vectorization. An
1631   /// instruction is uniform if we represent it with a single scalar value in
1632   /// the vectorized loop corresponding to each vector iteration. Examples of
1633   /// uniform instructions include pointer operands of consecutive or
1634   /// interleaved memory accesses. Note that although uniformity implies an
1635   /// instruction will be scalar, the reverse is not true. In general, a
1636   /// scalarized instruction will be represented by VF scalar values in the
1637   /// vectorized loop, each corresponding to an iteration of the original
1638   /// scalar loop.
1639   void collectLoopUniforms(ElementCount VF);
1640 
1641   /// Collect the instructions that are scalar after vectorization. An
1642   /// instruction is scalar if it is known to be uniform or will be scalarized
1643   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1644   /// to the list if they are used by a load/store instruction that is marked as
1645   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1646   /// VF values in the vectorized loop, each corresponding to an iteration of
1647   /// the original scalar loop.
1648   void collectLoopScalars(ElementCount VF);
1649 
1650   /// Keeps cost model vectorization decision and cost for instructions.
1651   /// Right now it is used for memory instructions only.
1652   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1653                                 std::pair<InstWidening, InstructionCost>>;
1654 
1655   DecisionList WideningDecisions;
1656 
1657   using CallDecisionList =
1658       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1659 
1660   CallDecisionList CallWideningDecisions;
1661 
1662   /// Returns true if \p V is expected to be vectorized and it needs to be
1663   /// extracted.
1664   bool needsExtract(Value *V, ElementCount VF) const {
1665     Instruction *I = dyn_cast<Instruction>(V);
1666     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1667         TheLoop->isLoopInvariant(I) ||
1668         getWideningDecision(I, VF) == CM_Scalarize)
1669       return false;
1670 
1671     // Assume we can vectorize V (and hence we need extraction) if the
1672     // scalars are not computed yet. This can happen, because it is called
1673     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1674     // the scalars are collected. That should be a safe assumption in most
1675     // cases, because we check if the operands have vectorizable types
1676     // beforehand in LoopVectorizationLegality.
1677     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1678   };
1679 
1680   /// Returns a range containing only operands needing to be extracted.
1681   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1682                                                    ElementCount VF) const {
1683     return SmallVector<Value *, 4>(make_filter_range(
1684         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1685   }
1686 
1687 public:
1688   /// The loop that we evaluate.
1689   Loop *TheLoop;
1690 
1691   /// Predicated scalar evolution analysis.
1692   PredicatedScalarEvolution &PSE;
1693 
1694   /// Loop Info analysis.
1695   LoopInfo *LI;
1696 
1697   /// Vectorization legality.
1698   LoopVectorizationLegality *Legal;
1699 
1700   /// Vector target information.
1701   const TargetTransformInfo &TTI;
1702 
1703   /// Target Library Info.
1704   const TargetLibraryInfo *TLI;
1705 
1706   /// Demanded bits analysis.
1707   DemandedBits *DB;
1708 
1709   /// Assumption cache.
1710   AssumptionCache *AC;
1711 
1712   /// Interface to emit optimization remarks.
1713   OptimizationRemarkEmitter *ORE;
1714 
1715   const Function *TheFunction;
1716 
1717   /// Loop Vectorize Hint.
1718   const LoopVectorizeHints *Hints;
1719 
1720   /// The interleave access information contains groups of interleaved accesses
1721   /// with the same stride and close to each other.
1722   InterleavedAccessInfo &InterleaveInfo;
1723 
1724   /// Values to ignore in the cost model.
1725   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1726 
1727   /// Values to ignore in the cost model when VF > 1.
1728   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1729 
1730   /// All element types found in the loop.
1731   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1732 
1733   /// The kind of cost that we are calculating
1734   TTI::TargetCostKind CostKind;
1735 
1736   /// Whether this loop should be optimized for size based on function attribute
1737   /// or profile information.
1738   bool OptForSize;
1739 };
1740 } // end namespace llvm
1741 
1742 namespace {
1743 /// Helper struct to manage generating runtime checks for vectorization.
1744 ///
1745 /// The runtime checks are created up-front in temporary blocks, un-linked from
1746 /// the existing IR, to allow better estimation of their cost. After deciding to
1747 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1748 /// temporary blocks are completely removed.
1749 class GeneratedRTChecks {
1750   /// Basic block which contains the generated SCEV checks, if any.
1751   BasicBlock *SCEVCheckBlock = nullptr;
1752 
1753   /// The value representing the result of the generated SCEV checks. If it is
1754   /// nullptr no SCEV checks have been generated.
1755   Value *SCEVCheckCond = nullptr;
1756 
1757   /// Basic block which contains the generated memory runtime checks, if any.
1758   BasicBlock *MemCheckBlock = nullptr;
1759 
1760   /// The value representing the result of the generated memory runtime checks.
1761   /// If it is nullptr no memory runtime checks have been generated.
1762   Value *MemRuntimeCheckCond = nullptr;
1763 
1764   DominatorTree *DT;
1765   LoopInfo *LI;
1766   TargetTransformInfo *TTI;
1767 
1768   SCEVExpander SCEVExp;
1769   SCEVExpander MemCheckExp;
1770 
1771   bool CostTooHigh = false;
1772 
1773   Loop *OuterLoop = nullptr;
1774 
1775   PredicatedScalarEvolution &PSE;
1776 
1777   /// The kind of cost that we are calculating
1778   TTI::TargetCostKind CostKind;
1779 
1780 public:
1781   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1782                     LoopInfo *LI, TargetTransformInfo *TTI,
1783                     const DataLayout &DL, TTI::TargetCostKind CostKind)
1784       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1785         MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
1786         CostKind(CostKind) {}
1787 
1788   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1789   /// accurately estimate the cost of the runtime checks. The blocks are
1790   /// un-linked from the IR and are added back during vector code generation. If
1791   /// there is no vector code generation, the check blocks are removed
1792   /// completely.
1793   void create(Loop *L, const LoopAccessInfo &LAI,
1794               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1795 
1796     // Hard cutoff to limit compile-time increase in case a very large number of
1797     // runtime checks needs to be generated.
1798     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1799     // profile info.
1800     CostTooHigh =
1801         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1802     if (CostTooHigh)
1803       return;
1804 
1805     BasicBlock *LoopHeader = L->getHeader();
1806     BasicBlock *Preheader = L->getLoopPreheader();
1807 
1808     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1809     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1810     // may be used by SCEVExpander. The blocks will be un-linked from their
1811     // predecessors and removed from LI & DT at the end of the function.
1812     if (!UnionPred.isAlwaysTrue()) {
1813       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1814                                   nullptr, "vector.scevcheck");
1815 
1816       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1817           &UnionPred, SCEVCheckBlock->getTerminator());
1818       if (isa<Constant>(SCEVCheckCond)) {
1819         // Clean up directly after expanding the predicate to a constant, to
1820         // avoid further expansions re-using anything left over from SCEVExp.
1821         SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1822         SCEVCleaner.cleanup();
1823       }
1824     }
1825 
1826     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1827     if (RtPtrChecking.Need) {
1828       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1829       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1830                                  "vector.memcheck");
1831 
1832       auto DiffChecks = RtPtrChecking.getDiffChecks();
1833       if (DiffChecks) {
1834         Value *RuntimeVF = nullptr;
1835         MemRuntimeCheckCond = addDiffRuntimeChecks(
1836             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1837             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1838               if (!RuntimeVF)
1839                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1840               return RuntimeVF;
1841             },
1842             IC);
1843       } else {
1844         MemRuntimeCheckCond = addRuntimeChecks(
1845             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1846             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1847       }
1848       assert(MemRuntimeCheckCond &&
1849              "no RT checks generated although RtPtrChecking "
1850              "claimed checks are required");
1851     }
1852 
1853     if (!MemCheckBlock && !SCEVCheckBlock)
1854       return;
1855 
1856     // Unhook the temporary block with the checks, update various places
1857     // accordingly.
1858     if (SCEVCheckBlock)
1859       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1860     if (MemCheckBlock)
1861       MemCheckBlock->replaceAllUsesWith(Preheader);
1862 
1863     if (SCEVCheckBlock) {
1864       SCEVCheckBlock->getTerminator()->moveBefore(
1865           Preheader->getTerminator()->getIterator());
1866       auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1867       UI->setDebugLoc(DebugLoc::getTemporary());
1868       Preheader->getTerminator()->eraseFromParent();
1869     }
1870     if (MemCheckBlock) {
1871       MemCheckBlock->getTerminator()->moveBefore(
1872           Preheader->getTerminator()->getIterator());
1873       auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1874       UI->setDebugLoc(DebugLoc::getTemporary());
1875       Preheader->getTerminator()->eraseFromParent();
1876     }
1877 
1878     DT->changeImmediateDominator(LoopHeader, Preheader);
1879     if (MemCheckBlock) {
1880       DT->eraseNode(MemCheckBlock);
1881       LI->removeBlock(MemCheckBlock);
1882     }
1883     if (SCEVCheckBlock) {
1884       DT->eraseNode(SCEVCheckBlock);
1885       LI->removeBlock(SCEVCheckBlock);
1886     }
1887 
1888     // Outer loop is used as part of the later cost calculations.
1889     OuterLoop = L->getParentLoop();
1890   }
1891 
1892   InstructionCost getCost() {
1893     if (SCEVCheckBlock || MemCheckBlock)
1894       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1895 
1896     if (CostTooHigh) {
1897       InstructionCost Cost;
1898       Cost.setInvalid();
1899       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1900       return Cost;
1901     }
1902 
1903     InstructionCost RTCheckCost = 0;
1904     if (SCEVCheckBlock)
1905       for (Instruction &I : *SCEVCheckBlock) {
1906         if (SCEVCheckBlock->getTerminator() == &I)
1907           continue;
1908         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1909         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1910         RTCheckCost += C;
1911       }
1912     if (MemCheckBlock) {
1913       InstructionCost MemCheckCost = 0;
1914       for (Instruction &I : *MemCheckBlock) {
1915         if (MemCheckBlock->getTerminator() == &I)
1916           continue;
1917         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1918         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1919         MemCheckCost += C;
1920       }
1921 
1922       // If the runtime memory checks are being created inside an outer loop
1923       // we should find out if these checks are outer loop invariant. If so,
1924       // the checks will likely be hoisted out, and so the effective cost
1925       // will be reduced according to the outer loop trip count.
1926       if (OuterLoop) {
1927         ScalarEvolution *SE = MemCheckExp.getSE();
1928         // TODO: If profitable, we could refine this further by analysing every
1929         // individual memory check, since there could be a mixture of loop
1930         // variant and invariant checks that mean the final condition is
1931         // variant.
1932         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1933         if (SE->isLoopInvariant(Cond, OuterLoop)) {
1934           // It seems reasonable to assume that we can reduce the effective
1935           // cost of the checks even when we know nothing about the trip
1936           // count. Assume that the outer loop executes at least twice.
1937           unsigned BestTripCount = 2;
1938 
1939           // Get the best known TC estimate.
1940           if (auto EstimatedTC = getSmallBestKnownTC(
1941                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
1942             if (EstimatedTC->isFixed())
1943               BestTripCount = EstimatedTC->getFixedValue();
1944 
1945           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1946 
1947           // Let's ensure the cost is always at least 1.
1948           NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
1949                                      (InstructionCost::CostType)1);
1950 
1951           if (BestTripCount > 1)
1952             LLVM_DEBUG(dbgs()
1953                        << "We expect runtime memory checks to be hoisted "
1954                        << "out of the outer loop. Cost reduced from "
1955                        << MemCheckCost << " to " << NewMemCheckCost << '\n');
1956 
1957           MemCheckCost = NewMemCheckCost;
1958         }
1959       }
1960 
1961       RTCheckCost += MemCheckCost;
1962     }
1963 
1964     if (SCEVCheckBlock || MemCheckBlock)
1965       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1966                         << "\n");
1967 
1968     return RTCheckCost;
1969   }
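       // Worked example (illustrative numbers): MemCheckCost = 20 inside an
       // outer loop with an estimated trip count of 4 becomes 20 / 4 = 5 after
       // the hoisting adjustment, clamped to at least 1; without an estimate
       // the divisor defaults to the assumed trip count of 2.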
1970 
1971   /// Remove the created SCEV & memory runtime check blocks & instructions, if
1972   /// unused.
1973   ~GeneratedRTChecks() {
1974     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1975     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1976     bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
1977     bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
1978     if (SCEVChecksUsed)
1979       SCEVCleaner.markResultUsed();
1980 
1981     if (MemChecksUsed) {
1982       MemCheckCleaner.markResultUsed();
1983     } else {
1984       auto &SE = *MemCheckExp.getSE();
1985       // Memory runtime check generation creates compares that use expanded
1986       // values. Remove them before running the SCEVExpanderCleaners.
1987       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1988         if (MemCheckExp.isInsertedInstruction(&I))
1989           continue;
1990         SE.forgetValue(&I);
1991         I.eraseFromParent();
1992       }
1993     }
1994     MemCheckCleaner.cleanup();
1995     SCEVCleaner.cleanup();
1996 
1997     if (!SCEVChecksUsed)
1998       SCEVCheckBlock->eraseFromParent();
1999     if (!MemChecksUsed)
2000       MemCheckBlock->eraseFromParent();
2001   }
2002 
2003   /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2004   /// outside VPlan.
2005   std::pair<Value *, BasicBlock *> getSCEVChecks() {
2006     using namespace llvm::PatternMatch;
2007     if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
2008       return {nullptr, nullptr};
2009 
2010     return {SCEVCheckCond, SCEVCheckBlock};
2011   }
2012 
2013   /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2014   /// outside VPlan.
2015   std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
2016     return {MemRuntimeCheckCond, MemCheckBlock};
2017   }
2018 
2019   /// Return true if any runtime checks have been added.
2020   bool hasChecks() const {
2021     using namespace llvm::PatternMatch;
2022     return (SCEVCheckCond && !match(SCEVCheckCond, m_ZeroInt())) ||
2023            MemRuntimeCheckCond;
2024   }
2025 };
2026 } // namespace
2027 
2028 static bool useActiveLaneMask(TailFoldingStyle Style) {
2029   return Style == TailFoldingStyle::Data ||
2030          Style == TailFoldingStyle::DataAndControlFlow ||
2031          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2032 }
2033 
2034 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2035   return Style == TailFoldingStyle::DataAndControlFlow ||
2036          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2037 }
2038 
2039 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2040 // vectorization. The loop needs to be annotated with #pragma omp simd
2041 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2042 // vector length information is not provided, vectorization is not considered
2043 // explicit. Interleave hints are not allowed either. These limitations will be
2044 // relaxed in the future.
2045 // Please note that we are currently forced to abuse the pragma 'clang
2046 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2047 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2048 // provides *explicit vectorization hints* (LV can bypass legal checks and
2049 // assume that vectorization is legal). However, both hints are implemented
2050 // using the same metadata (llvm.loop.vectorize, processed by
2051 // LoopVectorizeHints). This will be fixed in the future when the native IR
2052 // representation for pragma 'omp simd' is introduced.
2053 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2054                                    OptimizationRemarkEmitter *ORE) {
2055   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2056   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2057 
2058   // Only outer loops with an explicit vectorization hint are supported.
2059   // Unannotated outer loops are ignored.
2060   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2061     return false;
2062 
2063   Function *Fn = OuterLp->getHeader()->getParent();
2064   if (!Hints.allowVectorization(Fn, OuterLp,
2065                                 true /*VectorizeOnlyWhenForced*/)) {
2066     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2067     return false;
2068   }
2069 
2070   if (Hints.getInterleave() > 1) {
2071     // TODO: Interleave support is future work.
2072     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2073                          "outer loops.\n");
2074     Hints.emitRemarkWithHints();
2075     return false;
2076   }
2077 
2078   return true;
2079 }
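     // Example of an outer loop this accepts (illustrative source, assuming
     // clang's loop-hint pragma syntax):
     //   #pragma clang loop vectorize(enable) vectorize_width(4)
     //   for (int i = 0; i < n; ++i)   // outer loop; no interleave hint
     //     for (int j = 0; j < m; ++j)
     //       a[i][j] = b[i][j] + c[i][j];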
2080 
2081 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2082                                   OptimizationRemarkEmitter *ORE,
2083                                   SmallVectorImpl<Loop *> &V) {
2084   // Collect inner loops and outer loops without irreducible control flow. For
2085   // now, only collect outer loops that have explicit vectorization hints. If we
2086   // are stress testing the VPlan H-CFG construction, we collect the outermost
2087   // loop of every loop nest.
2088   if (L.isInnermost() || VPlanBuildStressTest ||
2089       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2090     LoopBlocksRPO RPOT(&L);
2091     RPOT.perform(LI);
2092     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2093       V.push_back(&L);
2094       // TODO: Collect inner loops inside marked outer loops in case
2095       // vectorization fails for the outer loop. Do not invoke
2096       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2097       // already known to be reducible. We can use an inherited attribute for
2098       // that.
2099       return;
2100     }
2101   }
2102   for (Loop *InnerL : L)
2103     collectSupportedLoops(*InnerL, LI, ORE, V);
2104 }
2105 
2106 //===----------------------------------------------------------------------===//
2107 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2108 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2109 //===----------------------------------------------------------------------===//
2110 
2111 /// Compute the transformed value of Index at offset StartValue using step
2112 /// StepValue.
2113 /// For integer induction, returns StartValue + Index * StepValue.
2114 /// For pointer induction, returns StartValue[Index * StepValue].
2115 /// FIXME: The newly created binary instructions should contain nsw/nuw
2116 /// flags, which can be found from the original scalar operations.
2117 static Value *
2118 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2119                      Value *Step,
2120                      InductionDescriptor::InductionKind InductionKind,
2121                      const BinaryOperator *InductionBinOp) {
2122   using namespace llvm::PatternMatch;
2123   Type *StepTy = Step->getType();
2124   Value *CastedIndex = StepTy->isIntegerTy()
2125                            ? B.CreateSExtOrTrunc(Index, StepTy)
2126                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2127   if (CastedIndex != Index) {
2128     CastedIndex->setName(CastedIndex->getName() + ".cast");
2129     Index = CastedIndex;
2130   }
2131 
2132   // Note: the IR at this point is broken. We cannot use SE to create any new
2133   // SCEV and then expand it, hoping that SCEV's simplification will give us
2134   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2135   // lead to various SCEV crashes. So all we can do is to use builder and rely
2136   // on InstCombine for future simplifications. Here we handle some trivial
2137   // cases only.
2138   auto CreateAdd = [&B](Value *X, Value *Y) {
2139     assert(X->getType() == Y->getType() && "Types don't match!");
2140     if (match(X, m_ZeroInt()))
2141       return Y;
2142     if (match(Y, m_ZeroInt()))
2143       return X;
2144     return B.CreateAdd(X, Y);
2145   };
2146 
2147   // We allow X to be a vector type, in which case Y will potentially be
2148   // splatted into a vector with the same element count.
2149   auto CreateMul = [&B](Value *X, Value *Y) {
2150     assert(X->getType()->getScalarType() == Y->getType() &&
2151            "Types don't match!");
2152     if (match(X, m_One()))
2153       return Y;
2154     if (match(Y, m_One()))
2155       return X;
2156     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2157     if (XVTy && !isa<VectorType>(Y->getType()))
2158       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2159     return B.CreateMul(X, Y);
2160   };
2161 
2162   switch (InductionKind) {
2163   case InductionDescriptor::IK_IntInduction: {
2164     assert(!isa<VectorType>(Index->getType()) &&
2165            "Vector indices not supported for integer inductions yet");
2166     assert(Index->getType() == StartValue->getType() &&
2167            "Index type does not match StartValue type");
2168     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2169       return B.CreateSub(StartValue, Index);
2170     auto *Offset = CreateMul(Index, Step);
2171     return CreateAdd(StartValue, Offset);
2172   }
2173   case InductionDescriptor::IK_PtrInduction:
2174     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2175   case InductionDescriptor::IK_FpInduction: {
2176     assert(!isa<VectorType>(Index->getType()) &&
2177            "Vector indices not supported for FP inductions yet");
2178     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2179     assert(InductionBinOp &&
2180            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2181             InductionBinOp->getOpcode() == Instruction::FSub) &&
2182            "Original bin op should be defined for FP induction");
2183 
2184     Value *MulExp = B.CreateFMul(Step, Index);
2185     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2186                          "induction");
2187   }
2188   case InductionDescriptor::IK_NoInduction:
2189     return nullptr;
2190   }
2191   llvm_unreachable("invalid enum");
2192 }
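     // Worked example (illustrative): an integer induction with StartValue =
     // 100, Step = 3 and Index = 4 yields 100 + 4 * 3 = 112; with Step == -1
     // the code folds to 100 - 4 = 96. A pointer induction instead emits
     // ptradd(StartValue, Index * Step).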
2193 
2194 static std::optional<unsigned> getMaxVScale(const Function &F,
2195                                             const TargetTransformInfo &TTI) {
2196   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2197     return MaxVScale;
2198 
2199   if (F.hasFnAttribute(Attribute::VScaleRange))
2200     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2201 
2202   return std::nullopt;
2203 }
2204 
2205 /// For the given VF and UF and maximum trip count computed for the loop, return
2206 /// whether the induction variable is known not to overflow in the vectorized
2207 /// loop. If so, the runtime overflow check always evaluates to false and can be
2208 /// removed.
2209 static bool isIndvarOverflowCheckKnownFalse(
2210     const LoopVectorizationCostModel *Cost,
2211     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2212   // Always be conservative if we don't know the exact unroll factor.
2213   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2214 
2215   IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2216   APInt MaxUIntTripCount = IdxTy->getMask();
2217 
2218   // We know the runtime overflow check is known false iff the (max) trip-count
2219   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2220   // the vector loop induction variable.
2221   if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2222     uint64_t MaxVF = VF.getKnownMinValue();
2223     if (VF.isScalable()) {
2224       std::optional<unsigned> MaxVScale =
2225           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2226       if (!MaxVScale)
2227         return false;
2228       MaxVF *= *MaxVScale;
2229     }
2230 
2231     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2232   }
2233 
2234   return false;
2235 }
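     // Numeric example (illustrative): with an i8 widest induction type
     // (MaxUIntTripCount = 255), constant max trip count 200, VF = 4 and
     // MaxUF = 2, (255 - 200) ugt (4 * 2) holds, so the overflow check is
     // known false; with a max trip count of 250 it does not (5 <= 8), and
     // the check must stay.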
2236 
2237 // Return whether we allow using masked interleave-groups (for dealing with
2238 // strided loads/stores that reside in predicated blocks, or for dealing
2239 // with gaps).
2240 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2241   // If an override option has been passed in for interleaved accesses, use it.
2242   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2243     return EnableMaskedInterleavedMemAccesses;
2244 
2245   return TTI.enableMaskedInterleavedAccessVectorization();
2246 }
2247 
2248 Value *
2249 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2250   if (VectorTripCount)
2251     return VectorTripCount;
2252 
2253   Value *TC = getTripCount();
2254   IRBuilder<> Builder(InsertBlock->getTerminator());
2255 
2256   Type *Ty = TC->getType();
2257   // This is where we can make the step a runtime constant.
2258   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2259 
2260   // If the tail is to be folded by masking, round the number of iterations N
2261   // up to a multiple of Step instead of rounding down. This is done by first
2262   // adding Step-1 and then rounding down. Note that it's ok if this addition
2263   // overflows: the vector induction variable will eventually wrap to zero given
2264   // that it starts at zero and its Step is a power of two; the loop will then
2265   // exit, with the last early-exit vector comparison also producing all-true.
2266   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2267   // is accounted for in emitIterationCountCheck that adds an overflow check.
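  // For example, with Step = 8 and N = 13: N becomes 13 + 7 = 20, and the
  // URem/Sub below then yield a vector trip count of 20 - (20 % 8) = 16.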
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
                           "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop.  See the cost model for when this can happen.  If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
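  // E.g., with N = 16 and Step = 8, R = 0 is bumped to 8 so that n.vec = 8 and
  // the final 8 iterations run in the scalar epilogue.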
  if (Cost->requiresScalarEpilogue(VF.isVector())) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}

void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
  // Note: The block with the minimum trip-count check is already connected
  // during earlier VPlan construction.
  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
  VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
  assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
  assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
  VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
  VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
  PreVectorPH = CheckVPIRBB;
  VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
  PreVectorPH->swapSuccessors();

  // We just connected a new block to the scalar preheader. Update all
  // VPPhis by adding an incoming value for it, replicating the last value.
  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
  for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
           "must have incoming values for all operands");
    R.addOperand(R.getOperand(NumPredecessors - 2));
  }
}

Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
                                                      unsigned UF) const {
  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
                                                       : ICmpInst::ICMP_ULT;

  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *Count = getTripCount();
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProfitableTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(Builder, CountTy, VF, UF);

    Value *MinProfTC =
        createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
    if (!VF.isScalable())
      return MinProfTC;
    return Builder.CreateBinaryIntrinsic(
        Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None) {
    Value *Step = CreateStep();
    ScalarEvolution &SE = *PSE.getSE();
    // TODO: Emit unconditional branch to vector preheader instead of
    // conditional branch with known condition.
    const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
    // Check if the trip count is < the step.
    if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
      // TODO: Ensure step is at most the trip count when determining max VF and
      // UF, w/o tail folding.
      CheckMinIters = Builder.getTrue();
    } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
                                    TripCountSCEV, SE.getSCEV(Step))) {
      // Generate the minimum iteration check only if we cannot prove the
      // check is known to be true, or known to be false.
      CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
    } // else step known to be < trip count, use CheckMinIters preset to false.
  } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
             !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
             Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
    // vscale is not necessarily a power-of-2, which means we cannot guarantee
    // an overflow to zero when updating induction variables and so an
    // additional overflow check is required before entering the vector loop.

    // Get the maximum unsigned value for the type.
    Value *MaxUIntTripCount =
        ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
    Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);

    // Don't execute the vector loop if (UMax - n) < (VF * UF).
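    // E.g., with CountTy = i8, Count = 250 and VF * UF = 8: 255 - 250 = 5 < 8,
    // so we branch to the scalar loop rather than risk the IV wrapping.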
    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
  }
  return CheckMinIters;
}

void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  Value *CheckMinIters = createIterationCountCheck(VF, UF);
  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                   static_cast<DominatorTree *>(nullptr), LI,
                                   nullptr, "vector.ph");

  BranchInst &BI =
      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);

  assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
             TCCheckBlock &&
         "Plan's entry must be TCCheckBlock");
}

/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
  for (auto &R : make_early_inc_range(*VPBB)) {
    assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) &&
           "Tried to move phi recipe after a non-phi recipe");
    R.moveBefore(*IRVPBB, IRVPBB->end());
  }

  VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
}

void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  assert((OrigLoop->getUniqueLatchExitBlock() ||
          Cost->requiresScalarEpilogue(VF.isVector())) &&
         "loops not exiting via the latch without required epilogue?");

  LoopScalarPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "scalar.ph");
  // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
  // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
  // preheader may be unreachable at this point. Instead it is replaced in
  // createVectorizedLoopSkeleton.
}

/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
/// expansion results.
static Value *getExpandedStep(const InductionDescriptor &ID,
                              const SCEV2ValueTy &ExpandedSCEVs) {
  const SCEV *Step = ID.getStep();
  if (auto *C = dyn_cast<SCEVConstant>(Step))
    return C->getValue();
  if (auto *U = dyn_cast<SCEVUnknown>(Step))
    return U->getValue();
  Value *V = ExpandedSCEVs.lookup(Step);
  assert(V && "SCEV must be expanded at this point");
  return V;
}

/// Knowing that loop \p L executes a single vector iteration, add instructions
/// that will get simplified and thus should not have any cost to \p
/// InstsToIgnore.
static void addFullyUnrolledInstructionsToIgnore(
    Loop *L, const LoopVectorizationLegality::InductionList &IL,
    SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
  auto *Cmp = L->getLatchCmpInst();
  if (Cmp)
    InstsToIgnore.insert(Cmp);
  for (const auto &KV : IL) {
    // Extract the key by hand so that it can be used in the lambda below.  Note
    // that captured structured bindings are a C++20 extension.
    const PHINode *IV = KV.first;

    // Get next iteration value of the induction variable.
    Instruction *IVInst =
        cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
    if (all_of(IVInst->users(),
               [&](const User *U) { return U == IV || U == Cmp; }))
      InstsToIgnore.insert(IVInst);
  }
}

BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
     /  |      preheader are expanded here. Eventually all required SCEV
    /   |      expansion should happen here.
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop (created during VPlan execution).
  |     |
  |     v
  \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
   |    |                       successors created during VPlan execution)
   \/   |
   /\   v
   | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
   |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
   |    |          wrapped in VPIRBasicBlock).
    \   |
     \  v
      >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
   ...
   */

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitIterationCountCheck(LoopScalarPreHeader);

  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
  return LoopVectorPreHeader;
}

namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(),
                        hash_combine_range(I->operand_values()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
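/// E.g., widening can emit two identical getelementptr instructions computing
/// the step for the same induction; the second is replaced by the first and
/// erased.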
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (Instruction &In : llvm::make_early_inc_range(*BB)) {
    if (!CSEDenseMapInfo::canHandle(&In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(&In)) {
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
      continue;
    }

    CSEMap[&In] = &In;
  }
}

/// This function attempts to return a value that represents the vectorization
/// factor at runtime. For fixed-width VFs we know this precisely at compile
/// time, but for scalable VFs we calculate it based on an estimate of the
/// vscale value.
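/// For example, VF = vscale x 4 with an estimated vscale of 2 yields 8.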
static unsigned getEstimatedRuntimeVF(ElementCount VF,
                                      std::optional<unsigned> VScale) {
  unsigned EstimatedVF = VF.getKnownMinValue();
  if (VF.isScalable())
    if (VScale)
      EstimatedVF *= *VScale;
  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
  return EstimatedVF;
}

InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return getCallWideningDecision(CI, VF).Cost;

  Type *RetTy = CI->getType();
  if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
      return *RedCost;

  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    return std::min(ScalarCallCost, IntrinsicCost);
  }
  return ScalarCallCost;
}

static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
  if (VF.isScalar() || !canVectorizeTy(Ty))
    return Ty;
  return toVectorizedTy(Ty, VF);
}

InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = maybeVectorizeType(CI->getType(), VF);
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->args());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  SmallVector<Type *> ParamTys;
  std::transform(FTy->param_begin(), FTy->param_end(),
                 std::back_inserter(ParamTys),
                 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI),
                                    InstructionCost::getInvalid(), TLI);
  return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Fix widened non-induction PHIs by setting up the PHI operands.
  fixNonInductionPHIs(State);

  // After vectorization, the exit blocks of the original loop will have
  // additional predecessors. Invalidate SCEVs for the exit phis in case SE
  // looked through single-entry phis.
  SmallVector<BasicBlock *> ExitBlocks;
  OrigLoop->getExitBlocks(ExitBlocks);
  for (BasicBlock *Exit : ExitBlocks)
    for (PHINode &PN : Exit->phis())
      PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);
  PSE.getSE()->forgetBlockAndLoopDispositions();

  // Don't apply optimizations below when no (vector) loop remains, as they all
  // require one at the moment.
  VPBasicBlock *HeaderVPBB =
      vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
  if (!HeaderVPBB)
    return;

  BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];

  // Remove redundant induction instructions.
  cse(HeaderBB);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // becomes the scalar remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // use the value of vscale used for tuning.
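  // E.g., if the profile estimated ~1000 iterations and EstimatedVFxUF is 8,
  // the vector loop is assigned roughly 125 iterations' worth of the weight.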
  Loop *VectorLoop = LI->getLoopFor(HeaderBB);
  unsigned EstimatedVFxUF =
      getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
}

void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
  auto Iter = vp_depth_first_shallow(Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &P : VPBB->phis()) {
      VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
      if (!VPPhi)
        continue;
      PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
      // Make sure the builder has a valid insert point.
      Builder.SetInsertPoint(NewPhi);
      for (unsigned Idx = 0; Idx < VPPhi->getNumIncoming(); ++Idx) {
        VPValue *Inc = VPPhi->getIncomingValue(Idx);
        const VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
        NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
      }
    }
  }
}

void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert_range(Uniforms[VF]);
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
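  // E.g., the pointer of a consecutive (CM_Widen) load stays scalar because
  // the wide load is addressed from lane 0 only, whereas a gather
  // (CM_GatherScatter) consumes a vector of pointers.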
  auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a getelementptr
  // instruction contained in the loop.
  auto IsLoopVaryingGEP = [&](Value *V) {
    return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!IsLoopVaryingGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (IsScalarUse(MemAccess, Ptr) &&
        all_of(I->users(), IsaPred<LoadInst, StoreInst>))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert_range(Uniforms[VF]);

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar unless the operation is a gather or scatter.
  // The value operand of a store will remain scalar if the store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        EvaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        EvaluatePtrUse(Store, Store->getPointerOperand());
        EvaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I
                        << "\n");
      Worklist.insert(I);
    }

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!IsLoopVaryingGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  IsScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable nor its update should be marked scalar after
    // vectorization.
    auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
    if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
    });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert_range(Worklist);
}

bool LoopVectorizationCostModel::isScalarWithPredication(
    Instruction *I, ElementCount VF) const {
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch (I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    if (VF.isScalar())
      return true;
    return getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getLoadStoreType(I);
    unsigned AS = getLoadStoreAddressSpace(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(Ty, VF);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment, AS) ||
                                TTI.isLegalMaskedGather(VTy, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment, AS) ||
                                TTI.isLegalMaskedScatter(VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
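    // E.g., `x / d` becomes `x / select(mask, d, 1)`, so every lane has a
    // well-defined divisor.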
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}

// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  // TODO: We can use the loop-preheader as context point here and get
  // context sensitive reasoning for isSafeToSpeculativelyExecute.
  if (isSafeToSpeculativelyExecute(I) ||
      (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
      isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
    return false;

  // If the instruction was executed conditionally in the original scalar loop,
  // predication is needed with a mask whose lanes are all possibly inactive.
  if (Legal->blockNeedsPredication(I->getParent()))
    return true;

  // If we're not folding the tail by masking, predication is unnecessary.
  if (!foldTailByMasking())
    return false;

  // All that remain are instructions with side-effects originally executed in
  // the loop unconditionally, but now execute under a tail-fold mask (only)
  // having at least one active lane (the first). If the side-effects of the
  // instruction are invariant, executing it w/o (the tail-folding) mask is safe
  // - it will cause the same side-effects as when masked.
  switch (I->getOpcode()) {
  default:
    llvm_unreachable(
        "instruction should have been considered by earlier checks");
  case Instruction::Call:
    // Side-effects of a Call are assumed to be non-invariant, needing a
    // (fold-tail) mask.
    assert(Legal->isMaskRequired(I) &&
           "should have returned earlier for calls not needing a mask");
    return true;
  case Instruction::Load:
    // If the address is loop invariant no predication is needed.
    return !Legal->isInvariant(getLoadStorePointerOperand(I));
  case Instruction::Store: {
    // For stores, we need to prove both speculation safety (which follows from
    // the same argument as loads), but also must prove the value being stored
    // is correct. The easiest form of the latter is to require that all values
    // stored are the same.
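    // E.g., an unconditional `*p = 42` with invariant p writes the same value
    // to the same address whichever lanes are active, so no mask is needed.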
    return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
             Legal->isInvariant(cast<StoreInst>(I)->getValueOperand()));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // If the divisor is loop-invariant no predication is needed.
    return !Legal->isInvariant(I->getOperand(1));
  }
}

std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) const {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  assert(!isSafeToSpeculativelyExecute(I));

  // Scalarization isn't legal for scalable vector types.
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost +=
        VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost +=
        VF.getFixedValue() *
        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
  }
  InstructionCost SafeDivisorCost = 0;

  auto *VecTy = toVectorTy(I->getType(), VF);

  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost +=
      TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
                             toVectorTy(Type::getInt1Ty(I->getContext()), VF),
                             CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this are shifts on x86.
  Value *Op2 = I->getOperand(1);
  auto Op2Info = TTI.getOperandInfo(Op2);
  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
      Legal->isInvariant(Op2))
    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      I->getOpcode(), VecTy, CostKind,
      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
      Op2Info, Operands, I);
  return {ScalarizationCost, SafeDivisorCost};
}

bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) const {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");
  unsigned InterleaveFactor = Group->getFactor();

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(ScalarTy, DL))
    return false;

  // For scalable vectors, the interleave factors must be <= 8 since we require
  // the (de)interleaveN intrinsics instead of shufflevectors.
  if (VF.isScalable() && InterleaveFactor > 8)
    return false;

  // If the group involves a non-integral pointer, we may not be able to
  // losslessly cast all values to a common type.
  bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
  for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
    Instruction *Member = Group->getMember(Idx);
    if (!Member)
      continue;
    auto *MemberTy = getLoadStoreType(Member);
    bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
    // Don't coerce non-integral pointers to integers or vice versa.
    if (MemberNI != ScalarNI)
      // TODO: Consider adding special nullptr value case here
      return false;
    if (MemberNI && ScalarNI &&
        ScalarTy->getPointerAddressSpace() !=
            MemberTy->getPointerAddressSpace())
      return false;
  }

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
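  // E.g., a factor-2 store group with only member 0 present leaves a gap every
  // other element and therefore needs masking to avoid writing the gaps.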
  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(I->getParent()) &&
      Legal->isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  if (Group->isReverse())
    return false;

  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
                          : TTI.isLegalMaskedStore(Ty, Alignment, AS);
}

bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);
  auto *ScalarTy = getLoadStoreType(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I, VF))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getDataLayout();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // Now we know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto IsOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that require predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (IsOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isPredicatedInst(I)) {
      LLVM_DEBUG(
          dbgs() << "LV: Found not uniform due to requiring predication: " << *I
                 << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branches exiting the loop. If the branch
  // condition is an instruction contained in the loop that is only used by the
  // branch, it is uniform. Note conditions from uncountable early exits are not
  // uniform.
  SmallVector<BasicBlock *> Exiting;
  TheLoop->getExitingBlocks(Exiting);
  for (BasicBlock *E : Exiting) {
    if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
      continue;
    auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
      AddToWorklistIfAllowed(Cmp);
  }

  auto PrevVF = VF.divideCoefficientBy(2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto IsUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(I))
        return false;
    }
    if (!Legal->isUniformMemOp(*I, VF))
      return false;
    if (isa<LoadInst>(I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
  };

  auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (IsUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
      return false;
    return getLoadStorePointerOperand(I) == Ptr &&
           (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform).
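  // E.g., the address of a consecutive load has a uniform use: the widened
  // load only needs lane 0's address, even though the GEP produces a
  // different value for every lane.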
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(&I))
            AddToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
        if (IsOutOfScope(EVI->getAggregateOperand())) {
          AddToWorklistIfAllowed(EVI);
          continue;
        }
        // Only ExtractValue instructions where the aggregate value comes from a
        // call are allowed to be non-uniform.
        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be call return value");
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      if (IsUniformMemOpUse(&I))
        AddToWorklistIfAllowed(&I);

      if (IsVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (IsOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(V);
    bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
      auto *UI = cast<Instruction>(U);
      return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
    });
    if (UsersAreMemAccesses)
      AddToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *I = Worklist[Idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (IsOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFixedOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
          }))
        AddToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == Ind || Worklist.count(I) ||
             IsVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    AddToWorklistIfAllowed(Ind);
    AddToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert_range(Worklist);
}

bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getPredicate().isAlwaysTrue()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
  if (IsScalableVectorizationAllowed)
    return *IsScalableVectorizationAllowed;

  IsScalableVectorizationAllowed = false;
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return false;

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                            "ScalableVectorizationDisabled", ORE, TheLoop);
    return false;
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(MaxScalableVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo("Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
    reportVectorizationInfo("The target does not provide maximum vscale value "
                            "for safe distance analysis.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  IsScalableVectorizationAllowed = true;
  return true;
}
3491 
3492 ElementCount
3493 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3494   if (!isScalableVectorizationAllowed())
3495     return ElementCount::getScalable(0);
3496 
3497   auto MaxScalableVF = ElementCount::getScalable(
3498       std::numeric_limits<ElementCount::ScalarTy>::max());
3499   if (Legal->isSafeForAnyVectorWidth())
3500     return MaxScalableVF;
3501 
3502   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3503   // Limit MaxScalableVF by the maximum safe dependence distance.
3504   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3505 
3506   if (!MaxScalableVF)
3507     reportVectorizationInfo(
3508         "Max legal vector width too small, scalable vectorization "
3509         "unfeasible.",
3510         "ScalableVFUnfeasible", ORE, TheLoop);
3511 
3512   return MaxScalableVF;
3513 }
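
// Worked example (hypothetical values): if dependence analysis allows
// MaxSafeElements = 32 and the target reports a maximum vscale of 16, the
// result above is ElementCount::getScalable(32 / 16) = vscale x 2, i.e. a
// <vscale x 2 x ty> vector never spans more than 32 elements. Dereferencing
// MaxVScale without a check is safe here because
// isScalableVectorizationAllowed() already returned false when the target
// provides no maximum vscale for safe-distance analysis.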
3514 
3515 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3516     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3517   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3518   unsigned SmallestType, WidestType;
3519   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3520 
3521   // Get the maximum safe dependence distance in bits computed by LAA.
3522   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3523   // the memory access that is most restrictive (involved in the smallest
3524   // dependence distance).
3525   unsigned MaxSafeElementsPowerOf2 =
3526       bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3527   if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
3528     unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
3529     MaxSafeElementsPowerOf2 =
3530         std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
3531   }
3532   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
3533   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
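
  // For example (hypothetical numbers): if LAA reports a maximum safe
  // dependence distance of 256 bits and the widest accessed type is i32, then
  // MaxSafeElementsPowerOf2 = bit_floor(256 / 32) = 8, giving a max safe fixed
  // VF of 8 and, subject to the vscale clamping in getMaxLegalScalableVF, a
  // max safe scalable VF of vscale x 8.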
3534 
3535   if (!Legal->isSafeForAnyVectorWidth())
3536     this->MaxSafeElements = MaxSafeElementsPowerOf2;
3537 
3538   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3539                     << ".\n");
3540   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3541                     << ".\n");
3542 
3543   // First analyze the UserVF, fall back if the UserVF should be ignored.
3544   if (UserVF) {
3545     auto MaxSafeUserVF =
3546         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3547 
3548     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3549       // If `VF=vscale x N` is safe, then so is `VF=N`
3550       if (UserVF.isScalable())
3551         return FixedScalableVFPair(
3552             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3553 
3554       return UserVF;
3555     }
3556 
3557     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3558 
3559     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3560     // is better to ignore the hint and let the compiler choose a suitable VF.
3561     if (!UserVF.isScalable()) {
3562       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3563                         << " is unsafe, clamping to max safe VF="
3564                         << MaxSafeFixedVF << ".\n");
3565       ORE->emit([&]() {
3566         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3567                                           TheLoop->getStartLoc(),
3568                                           TheLoop->getHeader())
3569                << "User-specified vectorization factor "
3570                << ore::NV("UserVectorizationFactor", UserVF)
3571                << " is unsafe, clamping to maximum safe vectorization factor "
3572                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3573       });
3574       return MaxSafeFixedVF;
3575     }
3576 
3577     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3578       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3579                         << " is ignored because scalable vectors are not "
3580                            "available.\n");
3581       ORE->emit([&]() {
3582         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3583                                           TheLoop->getStartLoc(),
3584                                           TheLoop->getHeader())
3585                << "User-specified vectorization factor "
3586                << ore::NV("UserVectorizationFactor", UserVF)
3587                << " is ignored because the target does not support scalable "
3588                   "vectors. The compiler will pick a more suitable value.";
3589       });
3590     } else {
3591       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3592                         << " is unsafe. Ignoring scalable UserVF.\n");
3593       ORE->emit([&]() {
3594         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3595                                           TheLoop->getStartLoc(),
3596                                           TheLoop->getHeader())
3597                << "User-specified vectorization factor "
3598                << ore::NV("UserVectorizationFactor", UserVF)
3599                << " is unsafe. Ignoring the hint to let the compiler pick a "
3600                   "more suitable value.";
3601       });
3602     }
3603   }
3604 
3605   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3606                     << " / " << WidestType << " bits.\n");
3607 
3608   FixedScalableVFPair Result(ElementCount::getFixed(1),
3609                              ElementCount::getScalable(0));
3610   if (auto MaxVF =
3611           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3612                                   MaxSafeFixedVF, FoldTailByMasking))
3613     Result.FixedVF = MaxVF;
3614 
3615   if (auto MaxVF =
3616           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3617                                   MaxSafeScalableVF, FoldTailByMasking))
3618     if (MaxVF.isScalable()) {
3619       Result.ScalableVF = MaxVF;
3620       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3621                         << "\n");
3622     }
3623 
3624   return Result;
3625 }
3626 
3627 FixedScalableVFPair
3628 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3629   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3630     // TODO: It may still be worth vectorizing, since the runtime checks are
3631     // likely to be dynamically uniform if the target can skip them.
3632     reportVectorizationFailure(
3633         "Not inserting runtime ptr check for divergent target",
3634         "runtime pointer checks needed. Not enabled for divergent target",
3635         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3636     return FixedScalableVFPair::getNone();
3637   }
3638 
3639   ScalarEvolution *SE = PSE.getSE();
3640   ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
3641   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3642   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3643   if (TC != ElementCount::getFixed(MaxTC))
3644     LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3645   if (TC.isScalar()) {
3646     reportVectorizationFailure("Single iteration (non) loop",
3647         "loop trip count is one, irrelevant for vectorization",
3648         "SingleIterationLoop", ORE, TheLoop);
3649     return FixedScalableVFPair::getNone();
3650   }
3651 
3652   // If BTC matches the widest induction type and is -1 then the trip count
3653   // computation will wrap to 0 and the vector trip count will be 0. Do not try
3654   // to vectorize.
3655   const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3656   if (!isa<SCEVCouldNotCompute>(BTC) &&
3657       BTC->getType()->getScalarSizeInBits() >=
3658           Legal->getWidestInductionType()->getScalarSizeInBits() &&
3659       SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
3660                            SE->getMinusOne(BTC->getType()))) {
3661     reportVectorizationFailure(
3662         "Trip count computation wrapped",
3663         "backedge-taken count is -1, loop trip count wrapped to 0",
3664         "TripCountWrapped", ORE, TheLoop);
3665     return FixedScalableVFPair::getNone();
3666   }
3667 
3668   switch (ScalarEpilogueStatus) {
3669   case CM_ScalarEpilogueAllowed:
3670     return computeFeasibleMaxVF(MaxTC, UserVF, false);
3671   case CM_ScalarEpilogueNotAllowedUsePredicate:
3672     [[fallthrough]];
3673   case CM_ScalarEpilogueNotNeededUsePredicate:
3674     LLVM_DEBUG(
3675         dbgs() << "LV: vector predicate hint/switch found.\n"
3676                << "LV: Not allowing scalar epilogue, creating predicated "
3677                << "vector loop.\n");
3678     break;
3679   case CM_ScalarEpilogueNotAllowedLowTripLoop:
3680     // fallthrough as a special case of OptForSize
3681   case CM_ScalarEpilogueNotAllowedOptSize:
3682     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3683       LLVM_DEBUG(
3684           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3685     else
3686       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3687                         << "count.\n");
3688 
3689     // Bail if runtime checks are required, which are not good when optimising
3690     // for size.
3691     if (runtimeChecksRequired())
3692       return FixedScalableVFPair::getNone();
3693 
3694     break;
3695   }
3696 
3697   // Now try tail folding.
3698 
3699   // Invalidate interleave groups that require an epilogue if we can't mask
3700   // the interleave-group.
3701   if (!useMaskedInterleavedAccesses(TTI)) {
3702     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3703            "No decisions should have been taken at this point");
3704     // Note: There is no need to invalidate any cost modeling decisions here, as
3705     // none were taken so far.
3706     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3707   }
3708 
3709   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
3710 
3711   // Avoid tail folding if the trip count is known to be a multiple of any VF
3712   // we choose.
3713   std::optional<unsigned> MaxPowerOf2RuntimeVF =
3714       MaxFactors.FixedVF.getFixedValue();
3715   if (MaxFactors.ScalableVF) {
3716     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3717     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3718       MaxPowerOf2RuntimeVF = std::max<unsigned>(
3719           *MaxPowerOf2RuntimeVF,
3720           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3721     } else
3722       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3723   }
3724 
3725   auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3726     // Return false if the loop is neither a single-latch-exit loop nor an
3727     // early-exit loop, as tail-folding is not supported in that case.
3728     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3729         !Legal->hasUncountableEarlyExit())
3730       return false;
3731     unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3732     ScalarEvolution *SE = PSE.getSE();
3733     // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3734     // with uncountable exits. For countable loops, the symbolic maximum must
3735     // remain identical to the known back-edge taken count.
3736     const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3737     assert((Legal->hasUncountableEarlyExit() ||
3738             BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3739            "Invalid loop count");
3740     const SCEV *ExitCount = SE->getAddExpr(
3741         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3742     const SCEV *Rem = SE->getURemExpr(
3743         SE->applyLoopGuards(ExitCount, TheLoop),
3744         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
3745     return Rem->isZero();
3746   };
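
  // A sketch of what the lambda above computes (hypothetical numbers): for a
  // countable loop with a backedge-taken count of 1023 the exit count is 1024;
  // with MaxVF = 8 and UserIC = 2, Rem = 1024 urem 16 = 0, so every chosen VF
  // divides the trip count evenly and no tail handling is needed.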
3747 
3748   if (MaxPowerOf2RuntimeVF > 0u) {
3749     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3750            "MaxFixedVF must be a power of 2");
3751     if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3752       // Accept MaxFixedVF if we do not have a tail.
3753       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3754       return MaxFactors;
3755     }
3756   }
3757 
3758   auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3759   if (ExpectedTC && ExpectedTC->isFixed() &&
3760       ExpectedTC->getFixedValue() <=
3761           TTI.getMinTripCountTailFoldingThreshold()) {
3762     if (MaxPowerOf2RuntimeVF > 0u) {
3763       // If we have a low-trip-count, and the fixed-width VF is known to divide
3764       // the trip count but the scalable factor does not, use the fixed-width
3765       // factor in preference to allow the generation of a non-predicated loop.
3766       if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3767           NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3768         LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3769                              "remain for any chosen VF.\n");
3770         MaxFactors.ScalableVF = ElementCount::getScalable(0);
3771         return MaxFactors;
3772       }
3773     }
3774 
3775     reportVectorizationFailure(
3776         "The trip count is below the minial threshold value.",
3777         "loop trip count is too low, avoiding vectorization", "LowTripCount",
3778         ORE, TheLoop);
3779     return FixedScalableVFPair::getNone();
3780   }
3781 
3782   // If we don't know the precise trip count, or if the trip count that we
3783   // found modulo the vectorization factor is not zero, try to fold the tail
3784   // by masking.
3785   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3786   bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3787   setTailFoldingStyles(ContainsScalableVF, UserIC);
3788   if (foldTailByMasking()) {
3789     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
3790       LLVM_DEBUG(
3791           dbgs()
3792           << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3793              "try to generate VP Intrinsics with scalable vector "
3794              "factors only.\n");
3795       // Tail folded loop using VP intrinsics restricts the VF to be scalable
3796       // for now.
3797       // TODO: extend it for fixed vectors, if required.
3798       assert(ContainsScalableVF && "Expected scalable vector factor.");
3799 
3800       MaxFactors.FixedVF = ElementCount::getFixed(1);
3801     }
3802     return MaxFactors;
3803   }
3804 
3805   // If there was a tail-folding hint/switch, but we can't fold the tail by
3806   // masking, fallback to a vectorization with a scalar epilogue.
3807   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3808     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3809                          "scalar epilogue instead.\n");
3810     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3811     return MaxFactors;
3812   }
3813 
3814   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3815     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3816     return FixedScalableVFPair::getNone();
3817   }
3818 
3819   if (TC.isZero()) {
3820     reportVectorizationFailure(
3821         "unable to calculate the loop count due to complex control flow",
3822         "UnknownLoopCountComplexCFG", ORE, TheLoop);
3823     return FixedScalableVFPair::getNone();
3824   }
3825 
3826   reportVectorizationFailure(
3827       "Cannot optimize for size and vectorize at the same time.",
3828       "cannot optimize for size and vectorize at the same time. "
3829       "Enable vectorization of this loop with '#pragma clang loop "
3830       "vectorize(enable)' when compiling with -Os/-Oz",
3831       "NoTailLoopWithOptForSize", ORE, TheLoop);
3832   return FixedScalableVFPair::getNone();
3833 }
3834 
3835 bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) {
3836   return useMaxBandwidth(VF.isScalable()
3837                              ? TargetTransformInfo::RGK_ScalableVector
3838                              : TargetTransformInfo::RGK_FixedWidthVector);
3839 }
3840 
3841 bool LoopVectorizationCostModel::useMaxBandwidth(
3842     TargetTransformInfo::RegisterKind RegKind) {
3843   return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3844                                (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
3845                                 (UseWiderVFIfCallVariantsPresent &&
3846                                  Legal->hasVectorCallVariants())));
3847 }
3848 
3849 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3850     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3851     ElementCount MaxSafeVF, bool FoldTailByMasking) {
3852   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3853   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3854       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3855                            : TargetTransformInfo::RGK_FixedWidthVector);
3856 
3857   // Convenience function to return the minimum of two ElementCounts.
3858   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3859     assert((LHS.isScalable() == RHS.isScalable()) &&
3860            "Scalable flags must match");
3861     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3862   };
3863 
3864   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3865   // Note that both WidestRegister and WidestType may not be powers of 2.
3866   auto MaxVectorElementCount = ElementCount::get(
3867       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
3868       ComputeScalableMaxVF);
3869   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3870   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3871                     << (MaxVectorElementCount * WidestType) << " bits.\n");
3872 
3873   if (!MaxVectorElementCount) {
3874     LLVM_DEBUG(dbgs() << "LV: The target has no "
3875                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
3876                       << " vector registers.\n");
3877     return ElementCount::getFixed(1);
3878   }
3879 
3880   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
3881   if (MaxVectorElementCount.isScalable() &&
3882       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
3883     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
3884     auto Min = Attr.getVScaleRangeMin();
3885     WidestRegisterMinEC *= Min;
3886   }
3887 
3888   // When a scalar epilogue is required, at least one iteration of the scalar
3889   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3890   // max VF that results in a dead vector loop.
3891   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
3892     MaxTripCount -= 1;
3893 
3894   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
3895       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
3896     // If upper bound loop trip count (TC) is known at compile time there is no
3897     // point in choosing VF greater than TC (as done in the loop below). Select
3898     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
3899     // scalable, we only fall back on a fixed VF when the TC is less than or
3900     // equal to the known number of lanes.
3901     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
3902     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3903                          "exceeding the constant trip count: "
3904                       << ClampedUpperTripCount << "\n");
3905     return ElementCount::get(
3906         ClampedUpperTripCount,
3907         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
3908   }
3909 
3910   TargetTransformInfo::RegisterKind RegKind =
3911       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3912                            : TargetTransformInfo::RGK_FixedWidthVector;
3913   ElementCount MaxVF = MaxVectorElementCount;
3914   if (useMaxBandwidth(RegKind)) {
3915     auto MaxVectorElementCountMaxBW = ElementCount::get(
3916         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
3917         ComputeScalableMaxVF);
3918     MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3919 
3920     if (ElementCount MinVF =
3921             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
3922       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
3923         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3924                           << ") with target's minimum: " << MinVF << '\n');
3925         MaxVF = MinVF;
3926       }
3927     }
3928 
3929     // Invalidate any widening decisions we might have made, in case the loop
3930     // requires prediction (decided later), but we have already made some
3931     // load/store widening decisions.
3932     invalidateCostModelingDecisions();
3933   }
3934   return MaxVF;
3935 }
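
// Worked example of the clamping above (hypothetical target values): with a
// 256-bit fixed-width register and WidestType = 32 bits,
// MaxVectorElementCount = bit_floor(256 / 32) = 8 lanes. If the maximum trip
// count is known to be 6 and the tail is not folded by masking, the returned
// VF is instead clamped to bit_floor(6) = 4 so the vector loop cannot be dead.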
3936 
3937 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3938                                                 const VectorizationFactor &B,
3939                                                 const unsigned MaxTripCount,
3940                                                 bool HasTail) const {
3941   InstructionCost CostA = A.Cost;
3942   InstructionCost CostB = B.Cost;
3943 
3944   // Improve estimate for the vector width if it is scalable.
3945   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3946   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3947   if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3948     if (A.Width.isScalable())
3949       EstimatedWidthA *= *VScale;
3950     if (B.Width.isScalable())
3951       EstimatedWidthB *= *VScale;
3952   }
3953 
3954   // When optimizing for size, choose the factor with the smallest cost for
3955   // the whole loop. On a tie, pick the larger vector width, on the assumption
3956   // that throughput will be greater.
3957   if (CM.CostKind == TTI::TCK_CodeSize)
3958     return CostA < CostB ||
3959            (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3960 
3961   // Assume vscale may be larger than 1 (or the value being tuned for),
3962   // so that scalable vectorization is slightly favorable over fixed-width
3963   // vectorization.
3964   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
3965                         A.Width.isScalable() && !B.Width.isScalable();
3966 
3967   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3968                                 const InstructionCost &RHS) {
3969     return PreferScalable ? LHS <= RHS : LHS < RHS;
3970   };
3971 
3972   // To avoid the need for FP division:
3973   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3974   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
3975   if (!MaxTripCount)
3976     return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3977 
3978   auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3979                                               InstructionCost VectorCost,
3980                                               InstructionCost ScalarCost) {
3981     // If the trip count is a known (possibly small) constant, the trip count
3982     // will be rounded up to an integer number of iterations under
3983     // FoldTailByMasking. The total cost in that case will be
3984     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3985     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3986     // some extra overheads, but for the purpose of comparing the costs of
3987     // different VFs we can use this to compare the total loop-body cost
3988     // expected after vectorization.
3989     if (HasTail)
3990       return VectorCost * (MaxTripCount / VF) +
3991              ScalarCost * (MaxTripCount % VF);
3992     return VectorCost * divideCeil(MaxTripCount, VF);
3993   };
3994 
3995   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3996   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3997   return CmpFn(RTCostA, RTCostB);
3998 }
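
// To see the cross-multiplication at work (hypothetical costs): comparing
// A = {Width 4, Cost 8} with B = {Width 8, Cost 12} and no known trip count,
// the per-lane costs are 8/4 = 2.0 and 12/8 = 1.5; the integer form compares
// 8 * 8 = 64 against 12 * 4 = 48, so A is not more profitable than B, without
// any floating-point division.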
3999 
4000 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
4001                                                 const VectorizationFactor &B,
4002                                                 bool HasTail) const {
4003   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4004   return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
4005                                                     HasTail);
4006 }
4007 
4008 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4009     OptimizationRemarkEmitter *ORE) {
4010   using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4011   SmallVector<RecipeVFPair> InvalidCosts;
4012   for (const auto &Plan : VPlans) {
4013     for (ElementCount VF : Plan->vectorFactors()) {
4014       // The VPlan-based cost model is designed for computing vector cost.
4015       // Querying the VPlan-based cost model with a scalar VF will cause
4016       // errors, because we expect the VF to be a vector for most of the
4017       // widen recipes.
4018       if (VF.isScalar())
4019         continue;
4020 
4021       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4022                             CM, CM.CostKind);
4023       precomputeCosts(*Plan, VF, CostCtx);
4024       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4025       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4026         for (auto &R : *VPBB) {
4027           if (!R.cost(VF, CostCtx).isValid())
4028             InvalidCosts.emplace_back(&R, VF);
4029         }
4030       }
4031     }
4032   }
4033   if (InvalidCosts.empty())
4034     return;
4035 
4036   // Emit a report of VFs with invalid costs in the loop.
4037 
4038   // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4039   DenseMap<VPRecipeBase *, unsigned> Numbering;
4040   unsigned I = 0;
4041   for (auto &Pair : InvalidCosts)
4042     if (Numbering.try_emplace(Pair.first, I).second)
4043       ++I;
4044 
4045   // Sort the list, first on recipe(number) then on VF.
4046   sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4047     unsigned NA = Numbering[A.first];
4048     unsigned NB = Numbering[B.first];
4049     if (NA != NB)
4050       return NA < NB;
4051     return ElementCount::isKnownLT(A.second, B.second);
4052   });
4053 
4054   // For a list of ordered recipe-VF pairs:
4055   //   [(load, VF1), (load, VF2), (store, VF1)]
4056   // group the recipes together to emit separate remarks for:
4057   //   load  (VF1, VF2)
4058   //   store (VF1)
4059   auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4060   auto Subset = ArrayRef<RecipeVFPair>();
4061   do {
4062     if (Subset.empty())
4063       Subset = Tail.take_front(1);
4064 
4065     VPRecipeBase *R = Subset.front().first;
4066 
4067     unsigned Opcode =
4068         TypeSwitch<const VPRecipeBase *, unsigned>(R)
4069             .Case<VPHeaderPHIRecipe>(
4070                 [](const auto *R) { return Instruction::PHI; })
4071             .Case<VPWidenSelectRecipe>(
4072                 [](const auto *R) { return Instruction::Select; })
4073             .Case<VPWidenStoreRecipe>(
4074                 [](const auto *R) { return Instruction::Store; })
4075             .Case<VPWidenLoadRecipe>(
4076                 [](const auto *R) { return Instruction::Load; })
4077             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4078                 [](const auto *R) { return Instruction::Call; })
4079             .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4080                   VPWidenCastRecipe>(
4081                 [](const auto *R) { return R->getOpcode(); })
4082             .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4083               return R->getStoredValues().empty() ? Instruction::Load
4084                                                   : Instruction::Store;
4085             });
4086 
4087     // If the next recipe is different, or if there are no other pairs,
4088     // emit a remark for the collated subset. e.g.
4089     //   [(load, VF1), (load, VF2))]
4090     // to emit:
4091     //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4092     if (Subset == Tail || Tail[Subset.size()].first != R) {
4093       std::string OutString;
4094       raw_string_ostream OS(OutString);
4095       assert(!Subset.empty() && "Unexpected empty range");
4096       OS << "Recipe with invalid costs prevented vectorization at VF=(";
4097       for (const auto &Pair : Subset)
4098         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4099       OS << "):";
4100       if (Opcode == Instruction::Call) {
4101         StringRef Name = "";
4102         if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4103           Name = Int->getIntrinsicName();
4104         } else {
4105           auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4106           Function *CalledFn =
4107               WidenCall ? WidenCall->getCalledScalarFunction()
4108                         : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4109                                              ->getLiveInIRValue());
4110           Name = CalledFn->getName();
4111         }
4112         OS << " call to " << Name;
4113       } else
4114         OS << " " << Instruction::getOpcodeName(Opcode);
4115       reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4116                               R->getDebugLoc());
4117       Tail = Tail.drop_front(Subset.size());
4118       Subset = {};
4119     } else
4120       // Grow the subset by one element
4121       Subset = Tail.take_front(Subset.size() + 1);
4122   } while (!Tail.empty());
4123 }
4124 
4125 /// Check if any recipe of \p Plan will generate a vector value, which will be
4126 /// assigned a vector register.
4127 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4128                                 const TargetTransformInfo &TTI) {
4129   assert(VF.isVector() && "Checking a scalar VF?");
4130   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4131   DenseSet<VPRecipeBase *> EphemeralRecipes;
4132   collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4133   // Set of already visited types.
4134   DenseSet<Type *> Visited;
4135   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4136            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4137     for (VPRecipeBase &R : *VPBB) {
4138       if (EphemeralRecipes.contains(&R))
4139         continue;
4140       // Continue early if the recipe is considered to not produce a vector
4141       // result. Note that this includes VPInstruction where some opcodes may
4142       // produce a vector, to preserve existing behavior as VPInstructions model
4143       // aspects not directly mapped to existing IR instructions.
4144       switch (R.getVPDefID()) {
4145       case VPDef::VPDerivedIVSC:
4146       case VPDef::VPScalarIVStepsSC:
4147       case VPDef::VPReplicateSC:
4148       case VPDef::VPInstructionSC:
4149       case VPDef::VPCanonicalIVPHISC:
4150       case VPDef::VPVectorPointerSC:
4151       case VPDef::VPVectorEndPointerSC:
4152       case VPDef::VPExpandSCEVSC:
4153       case VPDef::VPEVLBasedIVPHISC:
4154       case VPDef::VPPredInstPHISC:
4155       case VPDef::VPBranchOnMaskSC:
4156         continue;
4157       case VPDef::VPReductionSC:
4158       case VPDef::VPActiveLaneMaskPHISC:
4159       case VPDef::VPWidenCallSC:
4160       case VPDef::VPWidenCanonicalIVSC:
4161       case VPDef::VPWidenCastSC:
4162       case VPDef::VPWidenGEPSC:
4163       case VPDef::VPWidenIntrinsicSC:
4164       case VPDef::VPWidenSC:
4165       case VPDef::VPWidenSelectSC:
4166       case VPDef::VPBlendSC:
4167       case VPDef::VPFirstOrderRecurrencePHISC:
4168       case VPDef::VPHistogramSC:
4169       case VPDef::VPWidenPHISC:
4170       case VPDef::VPWidenIntOrFpInductionSC:
4171       case VPDef::VPWidenPointerInductionSC:
4172       case VPDef::VPReductionPHISC:
4173       case VPDef::VPInterleaveSC:
4174       case VPDef::VPWidenLoadEVLSC:
4175       case VPDef::VPWidenLoadSC:
4176       case VPDef::VPWidenStoreEVLSC:
4177       case VPDef::VPWidenStoreSC:
4178         break;
4179       default:
4180         llvm_unreachable("unhandled recipe");
4181       }
4182 
4183       auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4184         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4185         if (!NumLegalParts)
4186           return false;
4187         if (VF.isScalable()) {
4188           // <vscale x 1 x iN> is assumed to be profitable over iN because
4189           // scalable registers are a distinct register class from scalar
4190           // ones. If we ever find a target which wants to lower scalable
4191           // vectors back to scalars, we'll need to update this code to
4192           // explicitly ask TTI about the register class uses for each part.
4193           return NumLegalParts <= VF.getKnownMinValue();
4194         }
4195         // Two or more elements that share a register are vectorized.
4196         return NumLegalParts < VF.getFixedValue();
4197       };
4198 
4199       // If the recipe has no defs and is not a store (e.g., a branch), continue - no value to check.
4200       if (R.getNumDefinedValues() == 0 &&
4201           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4202               &R))
4203         continue;
4204       // For multi-def recipes (currently only interleaved loads), it
4205       // suffices to check the first def only.
4206       // For stores, check their stored value; for interleaved stores it
4207       // suffices to check the first stored value only. In all cases this is
4208       // the second operand.
4209       VPValue *ToCheck =
4210           R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4211       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4212       if (!Visited.insert({ScalarTy}).second)
4213         continue;
4214       Type *WideTy = toVectorizedTy(ScalarTy, VF);
4215       if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
4216         return true;
4217     }
4218   }
4219 
4220   return false;
4221 }
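
// Example for the check above (hypothetical 128-bit target): a recipe
// producing <4 x i64> legalizes to NumLegalParts = 2 registers; since
// 2 < VF = 4, lanes share registers and the recipe counts as generating vector
// values. If instead every lane needed its own part (NumLegalParts == VF), the
// value would effectively be scalarized and would not count.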
4222 
4223 static bool hasReplicatorRegion(VPlan &Plan) {
4224   return any_of(VPBlockUtils::blocksOnly<VPRegionBlock>(vp_depth_first_shallow(
4225                     Plan.getVectorLoopRegion()->getEntry())),
4226                 [](auto *VPRB) { return VPRB->isReplicator(); });
4227 }
4228 
4229 #ifndef NDEBUG
4230 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4231   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4232   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4233   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4234   assert(
4235       any_of(VPlans,
4236              [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4237       "Expected Scalar VF to be a candidate");
4238 
4239   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4240                                        ExpectedCost);
4241   VectorizationFactor ChosenFactor = ScalarCost;
4242 
4243   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4244   if (ForceVectorization &&
4245       (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4246     // Ignore scalar width, because the user explicitly wants vectorization.
4247     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4248     // evaluation.
4249     ChosenFactor.Cost = InstructionCost::getMax();
4250   }
4251 
4252   for (auto &P : VPlans) {
4253     ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4254                                P->vectorFactors().end());
4255 
4256     SmallVector<VPRegisterUsage, 8> RUs;
4257     if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
4258         CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
4259       RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4260 
4261     for (unsigned I = 0; I < VFs.size(); I++) {
4262       ElementCount VF = VFs[I];
4263       // The cost for scalar VF=1 is already calculated, so ignore it.
4264       if (VF.isScalar())
4265         continue;
4266 
4267       // Don't consider the VF if it exceeds the number of registers for the
4268       // target.
4269       if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI))
4270         continue;
4271 
4272       InstructionCost C = CM.expectedCost(VF);
4273 
4274       // Add on other costs that are modelled in VPlan, but not in the legacy
4275       // cost model.
4276       VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
4277                             CM, CM.CostKind);
4278       VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4279       assert(VectorRegion && "Expected to have a vector region!");
4280       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4281                vp_depth_first_shallow(VectorRegion->getEntry()))) {
4282         for (VPRecipeBase &R : *VPBB) {
4283           auto *VPI = dyn_cast<VPInstruction>(&R);
4284           if (!VPI)
4285             continue;
4286           switch (VPI->getOpcode()) {
4287           case VPInstruction::ActiveLaneMask:
4288           case VPInstruction::ExplicitVectorLength:
4289             C += VPI->cost(VF, CostCtx);
4290             break;
4291           default:
4292             break;
4293           }
4294         }
4295       }
4296 
4297       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4298       unsigned Width =
4299           getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
4300       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4301                         << " costs: " << (Candidate.Cost / Width));
4302       if (VF.isScalable())
4303         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4304                           << CM.getVScaleForTuning().value_or(1) << ")");
4305       LLVM_DEBUG(dbgs() << ".\n");
4306 
4307       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4308         LLVM_DEBUG(
4309             dbgs()
4310             << "LV: Not considering vector loop of width " << VF
4311             << " because it will not generate any vector instructions.\n");
4312         continue;
4313       }
4314 
4315       if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4316         LLVM_DEBUG(
4317             dbgs()
4318             << "LV: Not considering vector loop of width " << VF
4319             << " because it would cause replicated blocks to be generated,"
4320             << " which isn't allowed when optimizing for size.\n");
4321         continue;
4322       }
4323 
4324       if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4325         ChosenFactor = Candidate;
4326     }
4327   }
4328 
4329   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4330     reportVectorizationFailure(
4331         "There are conditional stores.",
4332         "store that is conditionally executed prevents vectorization",
4333         "ConditionalStore", ORE, OrigLoop);
4334     ChosenFactor = ScalarCost;
4335   }
4336 
4337   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4338                  !isMoreProfitable(ChosenFactor, ScalarCost,
4339                                    !CM.foldTailByMasking())) dbgs()
4340              << "LV: Vectorization seems to be not beneficial, "
4341              << "but was forced by a user.\n");
4342   return ChosenFactor;
4343 }
4344 #endif
4345 
4346 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4347     ElementCount VF) const {
4348   // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4349   // reductions need special handling and are currently unsupported.
4350   if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
4351         if (!Legal->isReductionVariable(&Phi))
4352           return Legal->isFixedOrderRecurrence(&Phi);
4353         RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
4354         return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
4355       }))
4356     return false;
4357 
4358   // Phis with uses outside of the loop require special handling and are
4359   // currently unsupported.
4360   for (const auto &Entry : Legal->getInductionVars()) {
4361     // Look for uses of the value of the induction at the last iteration.
4362     Value *PostInc =
4363         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4364     for (User *U : PostInc->users())
4365       if (!OrigLoop->contains(cast<Instruction>(U)))
4366         return false;
4367     // Look for uses of penultimate value of the induction.
4368     for (User *U : Entry.first->users())
4369       if (!OrigLoop->contains(cast<Instruction>(U)))
4370         return false;
4371   }
4372 
4373   // Epilogue vectorization code has not been audited to ensure it handles
4374   // non-latch exits properly. It may be fine, but it needs to be audited and
4375   // tested.
4376   // TODO: Add support for loops with an early exit.
4377   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4378     return false;
4379 
4380   return true;
4381 }
4382 
4383 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4384     const ElementCount VF, const unsigned IC) const {
4385   // FIXME: We need a much better cost-model to take different parameters such
4386   // as register pressure, code size increase and cost of extra branches into
4387   // account. For now we apply a very crude heuristic and only consider loops
4388   // with vectorization factors larger than a certain value.
4389 
4390   // Allow the target to opt out entirely.
4391   if (!TTI.preferEpilogueVectorization())
4392     return false;
4393 
4394   // We also consider epilogue vectorization unprofitable for targets that don't
4395   // consider interleaving beneficial (eg. MVE).
4396   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4397     return false;
4398 
4399   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4400   // VFs when deciding profitability.
4401   // See related "TODO: extend to support scalable VFs." in
4402   // selectEpilogueVectorizationFactor.
4403   unsigned Multiplier = VF.isFixed() ? IC : 1;
4404   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4405                                 ? EpilogueVectorizationMinVF
4406                                 : TTI.getEpilogueVectorizationMinVF();
4407   return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
4408          MinVFThreshold;
4409 }
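
// For instance (hypothetical numbers): with a main-loop VF of vscale x 4, a
// tuning vscale of 2 and IC = 2, Multiplier is 1 because the VF is scalable,
// so the estimated runtime VF is 4 * 2 = 8; epilogue vectorization is deemed
// profitable only if 8 >= the minimum-VF threshold chosen above.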
4410 
4411 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4412     const ElementCount MainLoopVF, unsigned IC) {
4413   VectorizationFactor Result = VectorizationFactor::Disabled();
4414   if (!EnableEpilogueVectorization) {
4415     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4416     return Result;
4417   }
4418 
4419   if (!CM.isScalarEpilogueAllowed()) {
4420     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4421                          "epilogue is allowed.\n");
4422     return Result;
4423   }
4424 
4425   // Not really a cost consideration, but check for unsupported cases here to
4426   // simplify the logic.
4427   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4428     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4429                          "is not a supported candidate.\n");
4430     return Result;
4431   }
4432 
4433   if (EpilogueVectorizationForceVF > 1) {
4434     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4435     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4436     if (hasPlanWithVF(ForcedEC))
4437       return {ForcedEC, 0, 0};
4438 
4439     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4440                          "viable.\n");
4441     return Result;
4442   }
4443 
4444   if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4445     LLVM_DEBUG(
4446         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4447     return Result;
4448   }
4449 
4450   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4451     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4452                          "this loop\n");
4453     return Result;
4454   }
4455 
4456   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4457   // the main loop handles 8 lanes per iteration. We could still benefit from
4458   // vectorizing the epilogue loop with VF=4.
4459   ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4460       getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
4461 
4462   ScalarEvolution &SE = *PSE.getSE();
4463   Type *TCType = Legal->getWidestInductionType();
4464   const SCEV *RemainingIterations = nullptr;
4465   unsigned MaxTripCount = 0;
4466   for (auto &NextVF : ProfitableVFs) {
4467     // Skip candidate VFs without a corresponding VPlan.
4468     if (!hasPlanWithVF(NextVF.Width))
4469       continue;
4470 
4471     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4472     // vectors) or > the VF of the main loop (fixed vectors).
4473     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4474          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4475         (NextVF.Width.isScalable() &&
4476          ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4477         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4478          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4479       continue;
4480 
4481     // If NextVF is greater than the number of remaining iterations, the
4482     // epilogue loop would be dead. Skip such factors.
4483     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4484       // TODO: extend to support scalable VFs.
4485       if (!RemainingIterations) {
4486         const SCEV *TC = vputils::getSCEVExprForVPValue(
4487             getPlanFor(NextVF.Width).getTripCount(), SE);
4488         assert(!isa<SCEVCouldNotCompute>(TC) &&
4489                "Trip count SCEV must be computable");
4490         RemainingIterations = SE.getURemExpr(
4491             TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
4492         MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4493         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4494                                 SE.getConstant(TCType, MaxTripCount))) {
4495           MaxTripCount =
4496               SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4497         }
4498         LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4499                           << MaxTripCount << "\n");
4500       }
4501       if (SE.isKnownPredicate(
4502               CmpInst::ICMP_UGT,
4503               SE.getConstant(TCType, NextVF.Width.getFixedValue()),
4504               RemainingIterations))
4505         continue;
4506     }
4507 
4508     if (Result.Width.isScalar() ||
4509         isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
4510       Result = NextVF;
4511   }
4512 
4513   if (Result != VectorizationFactor::Disabled())
4514     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4515                       << Result.Width << "\n");
4516   return Result;
4517 }
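
// A sketch of the remaining-iteration filter above (hypothetical values): for
// a main loop with fixed VF = 16 and IC = 2, at most 16 * 2 - 1 = 31
// iterations can remain for the epilogue, so a candidate epilogue VF of 8 may
// be kept; but if SCEV proves the remainder smaller than a candidate width,
// e.g. a trip count known to be a multiple of 32 leaves a remainder of 0, that
// candidate would produce a dead epilogue loop and is skipped.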
4518 
4519 std::pair<unsigned, unsigned>
4520 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4521   unsigned MinWidth = -1U;
4522   unsigned MaxWidth = 8;
4523   const DataLayout &DL = TheFunction->getDataLayout();
4524   // For in-loop reductions, no element types are added to ElementTypesInLoop
4525   // if there are no loads/stores in the loop. In this case, check through the
4526   // reduction variables to determine the maximum width.
4527   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4528     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4529       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4530       // When finding the min width used by the recurrence we need to account
4531       // for casts on the input operands of the recurrence.
4532       MinWidth = std::min(
4533           MinWidth,
4534           std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4535                    RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4536       MaxWidth = std::max(MaxWidth,
4537                           RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4538     }
4539   } else {
4540     for (Type *T : ElementTypesInLoop) {
4541       MinWidth = std::min<unsigned>(
4542           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4543       MaxWidth = std::max<unsigned>(
4544           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4545     }
4546   }
4547   return {MinWidth, MaxWidth};
4548 }
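
// For example (hypothetical loop): a loop that loads i8 values, widens them,
// and stores i32 results collects ElementTypesInLoop = {i8, i32} and returns
// {8, 32}; the 8-bit minimum is what later lets bandwidth maximization
// consider larger element counts per register.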
4549 
4550 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4551   ElementTypesInLoop.clear();
4552   // For each block.
4553   for (BasicBlock *BB : TheLoop->blocks()) {
4554     // For each instruction in the loop.
4555     for (Instruction &I : BB->instructionsWithoutDebug()) {
4556       Type *T = I.getType();
4557 
4558       // Skip ignored values.
4559       if (ValuesToIgnore.count(&I))
4560         continue;
4561 
4562       // Only examine Loads, Stores and PHINodes.
4563       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4564         continue;
4565 
4566       // Examine PHI nodes that are reduction variables. Update the type to
4567       // account for the recurrence type.
4568       if (auto *PN = dyn_cast<PHINode>(&I)) {
4569         if (!Legal->isReductionVariable(PN))
4570           continue;
4571         const RecurrenceDescriptor &RdxDesc =
4572             Legal->getRecurrenceDescriptor(PN);
4573         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4574             TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
4575                                       RdxDesc.getRecurrenceType()))
4576           continue;
4577         T = RdxDesc.getRecurrenceType();
4578       }
4579 
4580       // Examine the stored values.
4581       if (auto *ST = dyn_cast<StoreInst>(&I))
4582         T = ST->getValueOperand()->getType();
4583 
4584       assert(T->isSized() &&
4585              "Expected the load/store/recurrence type to be sized");
4586 
4587       ElementTypesInLoop.insert(T);
4588     }
4589   }
4590 }
4591 
4592 unsigned
4593 LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4594                                                   InstructionCost LoopCost) {
4595   // -- The interleave heuristics --
4596   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4597   // There are many micro-architectural considerations that we can't predict
4598   // at this level. For example, frontend pressure (on decode or fetch) due to
4599   // code size, or the number and capabilities of the execution ports.
4600   //
4601   // We use the following heuristics to select the interleave count:
4602   // 1. If the code has reductions, then we interleave to break the cross
4603   // iteration dependency.
4604   // 2. If the loop is really small, then we interleave to reduce the loop
4605   // overhead.
4606   // 3. We don't interleave if we think that we will spill registers to memory
4607   // due to the increased register pressure.
4608 
4609   if (!isScalarEpilogueAllowed())
4610     return 1;
4611 
4612   // Do not interleave if EVL is preferred and no User IC is specified.
4613   if (foldTailWithEVL()) {
4614     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4615                          "Unroll factor forced to be 1.\n");
4616     return 1;
4617   }
4618 
4619   // The max safe dependence distance already constrains the VF; do not interleave.
4620   if (!Legal->isSafeForAnyVectorWidth())
4621     return 1;
4622 
4623   // We don't attempt to perform interleaving for loops with uncountable early
4624   // exits because the VPInstruction::AnyOf code cannot currently handle
4625   // multiple parts.
4626   if (Legal->hasUncountableEarlyExit())
4627     return 1;
4628 
4629   const bool HasReductions = !Legal->getReductionVars().empty();
4630 
4631   // If we did not calculate the cost for VF (because the user selected the VF)
4632   // then we calculate the cost of VF here.
4633   if (LoopCost == 0) {
4634     LoopCost = expectedCost(VF);
4635     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4636 
4637     // Loop body is free and there is no need for interleaving.
4638     if (LoopCost == 0)
4639       return 1;
4640   }
4641 
4642   VPRegisterUsage R =
4643       calculateRegisterUsageForPlan(Plan, {VF}, TTI, ValuesToIgnore)[0];
4644   // We divide by these counts below, so clamp each to at least one
4645   // register user to avoid dividing by zero.
4646   for (auto &Pair : R.MaxLocalUsers) {
4647     Pair.second = std::max(Pair.second, 1U);
4648   }
4649 
4650   // We calculate the interleave count using the following formula.
4651   // Subtract the number of loop invariants from the number of available
4652   // registers. These registers are used by all of the interleaved instances.
4653   // Next, divide the remaining registers by the number of registers that is
4654   // required by the loop, in order to estimate how many parallel instances
4655   // fit without causing spills. All of this is rounded down if necessary to be
4656   // a power of two. We want power of two interleave count to simplify any
4657   // addressing operations or alignment considerations.
4658   // We also want power of two interleave counts to ensure that the induction
4659   // variable of the vector loop wraps to zero, when tail is folded by masking;
4660   // this currently happens when OptForSize, in which case IC is set to 1 above.
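  // Illustrative example (hypothetical numbers): with 32 registers in a
  // class, 2 of them held by loop invariants, and a peak of 10 local users,
  // the estimate below is bit_floor((32 - 2) / 10) = bit_floor(3) = 2
  // interleaved instances before spilling is expected.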
4661   unsigned IC = UINT_MAX;
4662 
4663   for (const auto &Pair : R.MaxLocalUsers) {
4664     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4665     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4666                       << " registers of "
4667                       << TTI.getRegisterClassName(Pair.first)
4668                       << " register class\n");
4669     if (VF.isScalar()) {
4670       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4671         TargetNumRegisters = ForceTargetNumScalarRegs;
4672     } else {
4673       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4674         TargetNumRegisters = ForceTargetNumVectorRegs;
4675     }
4676     unsigned MaxLocalUsers = Pair.second;
4677     unsigned LoopInvariantRegs = 0;
4678     if (R.LoopInvariantRegs.contains(Pair.first))
4679       LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4680 
4681     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4682                                      MaxLocalUsers);
4683     // Don't count the induction variable as interleaved.
4684     if (EnableIndVarRegisterHeur) {
4685       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4686                               std::max(1U, (MaxLocalUsers - 1)));
4687     }
4688 
4689     IC = std::min(IC, TmpIC);
4690   }
4691 
4692   // Clamp the interleave ranges to reasonable counts.
4693   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4694 
4695   // Check if the user has overridden the max.
4696   if (VF.isScalar()) {
4697     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4698       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4699   } else {
4700     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4701       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4702   }
4703 
4704   unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
4705 
4706   // Try to get the exact trip count, or, failing that, an estimate based on
4707   // profiling data or ConstantMax from PSE.
4708   if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
4709     // At least one iteration must be scalar when this constraint holds. So the
4710     // maximum available iterations for interleaving is one less.
4711     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4712                                ? BestKnownTC->getFixedValue() - 1
4713                                : BestKnownTC->getFixedValue();
4714 
4715     unsigned InterleaveCountLB = bit_floor(std::max(
4716         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4717 
4718     if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
4719       // If the best known trip count is exact, we select between two
4720       // prospective ICs, where
4721       //
4722       // 1) the aggressive IC is capped by the trip count divided by VF
4723       // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4724       //
4725       // The final IC is selected in a way that the epilogue loop trip count is
4726       // minimized while maximizing the IC itself, so that we either run the
4727       // vector loop at least once if it generates a small epilogue loop, or
4728       // else we run the vector loop at least twice.
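      // Illustrative example (hypothetical numbers): with AvailableTC = 64,
      // EstimatedVF = 8 and a target maximum of 8, the conservative IC is
      // bit_floor(64 / 16) = 4 and the aggressive IC is bit_floor(64 / 8) = 8.
      // Both leave a scalar tail of 0 iterations, so the aggressive IC wins.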
4729 
4730       unsigned InterleaveCountUB = bit_floor(std::max(
4731           1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4732       MaxInterleaveCount = InterleaveCountLB;
4733 
4734       if (InterleaveCountUB != InterleaveCountLB) {
4735         unsigned TailTripCountUB =
4736             (AvailableTC % (EstimatedVF * InterleaveCountUB));
4737         unsigned TailTripCountLB =
4738             (AvailableTC % (EstimatedVF * InterleaveCountLB));
4739         // If both produce the same scalar tail, maximize the IC to do the
4740         // same work in fewer vector loop iterations.
4741         if (TailTripCountUB == TailTripCountLB)
4742           MaxInterleaveCount = InterleaveCountUB;
4743       }
4744     } else {
4745       // If trip count is an estimated compile time constant, limit the
4746       // IC to be capped by the trip count divided by VF * 2, such that the
4747       // vector loop runs at least twice to make interleaving seem profitable
4748       // when there is an epilogue loop present. Since the exact trip count is
4749       // not known, we choose to be conservative in our IC estimate.
4750       MaxInterleaveCount = InterleaveCountLB;
4751     }
4752   }
4753 
4754   assert(MaxInterleaveCount > 0 &&
4755          "Maximum interleave count must be greater than 0");
4756 
4757   // Clamp the calculated IC to be between 1 and the max interleave count
4758   // that the target and trip count allow.
4759   if (IC > MaxInterleaveCount)
4760     IC = MaxInterleaveCount;
4761   else
4762     // Make sure IC is greater than 0.
4763     IC = std::max(1u, IC);
4764 
4765   assert(IC > 0 && "Interleave count must be greater than 0.");
4766 
4767   // Interleave if we vectorized this loop and there is a reduction that could
4768   // benefit from interleaving.
4769   if (VF.isVector() && HasReductions) {
4770     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4771     return IC;
4772   }
4773 
4774   // For any scalar loop that either requires runtime checks or predication we
4775   // are better off leaving this to the unroller. Note that if we've already
4776   // vectorized the loop we will have done the runtime check and so interleaving
4777   // won't require further checks.
4778   bool ScalarInterleavingRequiresPredication =
4779       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
4780          return Legal->blockNeedsPredication(BB);
4781        }));
4782   bool ScalarInterleavingRequiresRuntimePointerCheck =
4783       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
4784 
4785   // We want to interleave small loops in order to reduce the loop overhead and
4786   // potentially expose ILP opportunities.
4787   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
4788                     << "LV: IC is " << IC << '\n'
4789                     << "LV: VF is " << VF << '\n');
4790   const bool AggressivelyInterleaveReductions =
4791       TTI.enableAggressiveInterleaving(HasReductions);
4792   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
4793       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
4794     // We assume that the per-iteration overhead is 1 and use the cost model's
4795     // estimate of the loop cost, interleaving until the loop overhead is about
4796     // 5% of the cost of the loop.
4797     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
4798                                         SmallLoopCost / LoopCost.getValue()));
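    // Illustrative example: assuming a SmallLoopCost threshold of 20
    // (hypothetical value) and LoopCost = 4, SmallIC =
    // min(IC, bit_floor(20 / 4)) = min(IC, 4).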
4799 
4800     // Interleave until store/load ports (estimated by max interleave count) are
4801     // saturated.
4802     unsigned NumStores = Legal->getNumStores();
4803     unsigned NumLoads = Legal->getNumLoads();
4804     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4805     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
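    // Illustrative example (hypothetical numbers): with IC = 8, two stores
    // and one load, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8, so
    // interleaving by max(4, 8) = 8 would aim to saturate the ports.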
4806 
4807     // There is little point in interleaving for reductions containing selects
4808     // and compares when VF=1 since it may just create more overhead than it's
4809     // worth for loops with small trip counts. This is because we still have to
4810     // do the final reduction after the loop.
4811     bool HasSelectCmpReductions =
4812         HasReductions &&
4813         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
4814           const RecurrenceDescriptor &RdxDesc = Reduction.second;
4815           RecurKind RK = RdxDesc.getRecurrenceKind();
4816           return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
4817                  RecurrenceDescriptor::isFindIVRecurrenceKind(RK);
4818         });
4819     if (HasSelectCmpReductions) {
4820       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
4821       return 1;
4822     }
4823 
4824     // If we have a scalar reduction (vector reductions are already dealt with
4825     // by this point), we can increase the critical path length if the loop
4826     // we're interleaving is inside another loop. For tree-wise reductions
4827     // set the limit to 2, and for ordered reductions it's best to disable
4828     // interleaving entirely.
4829     if (HasReductions && TheLoop->getLoopDepth() > 1) {
4830       bool HasOrderedReductions =
4831           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
4832             const RecurrenceDescriptor &RdxDesc = Reduction.second;
4833             return RdxDesc.isOrdered();
4834           });
4835       if (HasOrderedReductions) {
4836         LLVM_DEBUG(
4837             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
4838         return 1;
4839       }
4840 
4841       unsigned F = MaxNestedScalarReductionIC;
4842       SmallIC = std::min(SmallIC, F);
4843       StoresIC = std::min(StoresIC, F);
4844       LoadsIC = std::min(LoadsIC, F);
4845     }
4846 
4847     if (EnableLoadStoreRuntimeInterleave &&
4848         std::max(StoresIC, LoadsIC) > SmallIC) {
4849       LLVM_DEBUG(
4850           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4851       return std::max(StoresIC, LoadsIC);
4852     }
4853 
4854     // If there are scalar reductions and TTI has enabled aggressive
4855     // interleaving for reductions, we will interleave to expose ILP.
4856     if (VF.isScalar() && AggressivelyInterleaveReductions) {
4857       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4858       // Interleave no less than SmallIC but not as aggressive as the normal IC
4859       // to satisfy the rare situation when resources are too limited.
4860       return std::max(IC / 2, SmallIC);
4861     }
4862 
4863     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4864     return SmallIC;
4865   }
4866 
4867   // Interleave if this is a large loop (small loops are already dealt with by
4868   // this point) that could benefit from interleaving.
4869   if (AggressivelyInterleaveReductions) {
4870     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4871     return IC;
4872   }
4873 
4874   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4875   return 1;
4876 }
4877 
4878 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4879                                                            ElementCount VF) {
4880   // TODO: Cost model for emulated masked load/store is completely
4881   // broken. This hack guides the cost model to use an artificially
4882   // high enough value to practically disable vectorization with such
4883   // operations, except where previously deployed legality hack allowed
4884   // using very low cost values. This is to avoid regressions coming simply
4885   // from moving "masked load/store" check from legality to cost model.
4886   // Masked Load/Gather emulation was previously never allowed.
4887   // Limited number of Masked Store/Scatter emulation was allowed.
4888   assert((isPredicatedInst(I)) &&
4889          "Expecting a scalar emulated instruction");
4890   return isa<LoadInst>(I) ||
4891          (isa<StoreInst>(I) &&
4892           NumPredStores > NumberOfStoresToPredicate);
4893 }
4894 
4895 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
4896   assert(VF.isVector() && "Expected VF >= 2");
4897 
4898   // If we've already collected the instructions to scalarize or the predicated
4899   // BBs after vectorization, there's nothing to do. Collection may already have
4900   // occurred if we have a user-selected VF and are now computing the expected
4901   // cost for interleaving.
4902   if (InstsToScalarize.contains(VF) ||
4903       PredicatedBBsAfterVectorization.contains(VF))
4904     return;
4905 
4906   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
4907   // not profitable to scalarize any instructions, the presence of VF in the
4908   // map will indicate that we've analyzed it already.
4909   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
4910 
4911   // Find all the instructions that are scalar with predication in the loop and
4912   // determine if it would be better to not if-convert the blocks they are in.
4913   // If so, we also record the instructions to scalarize.
4914   for (BasicBlock *BB : TheLoop->blocks()) {
4915     if (!blockNeedsPredicationForAnyReason(BB))
4916       continue;
4917     for (Instruction &I : *BB)
4918       if (isScalarWithPredication(&I, VF)) {
4919         ScalarCostsTy ScalarCosts;
4920         // Do not apply discount logic for:
4921         // 1. Scalars after vectorization, as there will only be a single copy
4922         // of the instruction.
4923         // 2. Scalable VF, as that would lead to invalid scalarization costs.
4924         // 3. Emulated masked memrefs, if a hacked cost is needed.
4925         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
4926             !useEmulatedMaskMemRefHack(&I, VF) &&
4927             computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
4928           ScalarCostsVF.insert_range(ScalarCosts);
4929           // Check if we decided to scalarize a call. If so, update the widening
4930           // decision of the call to CM_Scalarize with the computed scalar cost.
4931           for (const auto &[I, Cost] : ScalarCosts) {
4932             auto *CI = dyn_cast<CallInst>(I);
4933             if (!CI || !CallWideningDecisions.contains({CI, VF}))
4934               continue;
4935             CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
4936             CallWideningDecisions[{CI, VF}].Cost = Cost;
4937           }
4938         }
4939         // Remember that BB will remain after vectorization.
4940         PredicatedBBsAfterVectorization[VF].insert(BB);
4941         for (auto *Pred : predecessors(BB)) {
4942           if (Pred->getSingleSuccessor() == BB)
4943             PredicatedBBsAfterVectorization[VF].insert(Pred);
4944         }
4945       }
4946   }
4947 }
4948 
4949 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
4950     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
4951   assert(!isUniformAfterVectorization(PredInst, VF) &&
4952          "Instruction marked uniform-after-vectorization will be predicated");
4953 
4954   // Initialize the discount to zero, meaning that the scalar version and the
4955   // vector version cost the same.
4956   InstructionCost Discount = 0;
4957 
4958   // Holds instructions to analyze. The instructions we visit are mapped in
4959   // ScalarCosts. Those instructions are the ones that would be scalarized if
4960   // we find that the scalar version costs less.
4961   SmallVector<Instruction *, 8> Worklist;
4962 
4963   // Returns true if the given instruction can be scalarized.
4964   auto CanBeScalarized = [&](Instruction *I) -> bool {
4965     // We only attempt to scalarize instructions forming a single-use chain
4966     // from the original predicated block that would otherwise be vectorized.
4967     // Although not strictly necessary, we give up on instructions we know will
4968     // already be scalar to avoid traversing chains that are unlikely to be
4969     // beneficial.
4970     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
4971         isScalarAfterVectorization(I, VF))
4972       return false;
4973 
4974     // If the instruction is scalar with predication, it will be analyzed
4975     // separately. We ignore it within the context of PredInst.
4976     if (isScalarWithPredication(I, VF))
4977       return false;
4978 
4979     // If any of the instruction's operands are uniform after vectorization,
4980     // the instruction cannot be scalarized. This prevents, for example, a
4981     // masked load from being scalarized.
4982     //
4983     // We assume we will only emit a value for lane zero of an instruction
4984     // marked uniform after vectorization, rather than VF identical values.
4985     // Thus, if we scalarize an instruction that uses a uniform, we would
4986     // create uses of values corresponding to the lanes we aren't emitting code
4987     // for. This behavior can be changed by allowing getScalarValue to clone
4988     // the lane zero values for uniforms rather than asserting.
4989     for (Use &U : I->operands())
4990       if (auto *J = dyn_cast<Instruction>(U.get()))
4991         if (isUniformAfterVectorization(J, VF))
4992           return false;
4993 
4994     // Otherwise, we can scalarize the instruction.
4995     return true;
4996   };
4997 
4998   // Compute the expected cost discount from scalarizing the entire expression
4999   // feeding the predicated instruction. We currently only consider expressions
5000   // that are single-use instruction chains.
5001   Worklist.push_back(PredInst);
5002   while (!Worklist.empty()) {
5003     Instruction *I = Worklist.pop_back_val();
5004 
5005     // If we've already analyzed the instruction, there's nothing to do.
5006     if (ScalarCosts.contains(I))
5007       continue;
5008 
5009     // Cannot scalarize fixed-order recurrence phis at the moment.
5010     if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5011       continue;
5012 
5013     // Compute the cost of the vector instruction. Note that this cost already
5014     // includes the scalarization overhead of the predicated instruction.
5015     InstructionCost VectorCost = getInstructionCost(I, VF);
5016 
5017     // Compute the cost of the scalarized instruction. This cost is the cost of
5018     // the instruction as if it wasn't if-converted and instead remained in the
5019     // predicated block. We will scale this cost by block probability after
5020     // computing the scalarization overhead.
5021     InstructionCost ScalarCost =
5022         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5023 
5024     // Compute the scalarization overhead of needed insertelement instructions
5025     // and phi nodes.
5026     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5027       Type *WideTy = toVectorizedTy(I->getType(), VF);
5028       for (Type *VectorTy : getContainedTypes(WideTy)) {
5029         ScalarCost += TTI.getScalarizationOverhead(
5030             cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5031             /*Insert=*/true,
5032             /*Extract=*/false, CostKind);
5033       }
5034       ScalarCost +=
5035           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5036     }
5037 
5038     // Compute the scalarization overhead of needed extractelement
5039     // instructions. For each of the instruction's operands, if the operand can
5040     // be scalarized, add it to the worklist; otherwise, account for the
5041     // overhead.
5042     for (Use &U : I->operands())
5043       if (auto *J = dyn_cast<Instruction>(U.get())) {
5044         assert(canVectorizeTy(J->getType()) &&
5045                "Instruction has non-scalar type");
5046         if (CanBeScalarized(J))
5047           Worklist.push_back(J);
5048         else if (needsExtract(J, VF)) {
5049           Type *WideTy = toVectorizedTy(J->getType(), VF);
5050           for (Type *VectorTy : getContainedTypes(WideTy)) {
5051             ScalarCost += TTI.getScalarizationOverhead(
5052                 cast<VectorType>(VectorTy),
5053                 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5054                 /*Extract*/ true, CostKind);
5055           }
5056         }
5057       }
5058 
5059     // Scale the total scalar cost by block probability.
5060     ScalarCost /= getPredBlockCostDivisor(CostKind);
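    // Illustrative example (hypothetical costs): with a divisor of 2, a raw
    // scalar cost of 12 becomes 6; if the vector cost computed above is 10,
    // the discount below is 10 - 6 = 4, favoring scalarization of the chain.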
5061 
5062     // Compute the discount. A non-negative discount means the vector version
5063     // of the instruction costs more, and scalarizing would be beneficial.
5064     Discount += VectorCost - ScalarCost;
5065     ScalarCosts[I] = ScalarCost;
5066   }
5067 
5068   return Discount;
5069 }
5070 
5071 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5072   InstructionCost Cost;
5073 
5074   // If the vector loop gets executed exactly once with the given VF, ignore the
5075   // costs of comparison and induction instructions, as they'll get simplified
5076   // away.
5077   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5078   auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
5079   if (TC == VF && !foldTailByMasking())
5080     addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5081                                          ValuesToIgnoreForVF);
5082 
5083   // For each block.
5084   for (BasicBlock *BB : TheLoop->blocks()) {
5085     InstructionCost BlockCost;
5086 
5087     // For each instruction in the old loop.
5088     for (Instruction &I : BB->instructionsWithoutDebug()) {
5089       // Skip ignored values.
5090       if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5091           (VF.isVector() && VecValuesToIgnore.count(&I)))
5092         continue;
5093 
5094       InstructionCost C = getInstructionCost(&I, VF);
5095 
5096       // Check if we should override the cost.
5097       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5098         C = InstructionCost(ForceTargetInstructionCost);
5099 
5100       BlockCost += C;
5101       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5102                         << VF << " For instruction: " << I << '\n');
5103     }
5104 
5105     // If we are vectorizing a predicated block, it will have been
5106     // if-converted. This means that the block's instructions (aside from
5107     // stores and instructions that may divide by zero) will now be
5108     // unconditionally executed. For the scalar case, we may not always execute
5109     // the predicated block, if it is an if-else block. Thus, scale the block's
5110     // cost by the probability of executing it. blockNeedsPredication from
5111     // Legal is used so as to not include all blocks in tail folded loops.
5112     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5113       BlockCost /= getPredBlockCostDivisor(CostKind);
5114 
5115     Cost += BlockCost;
5116   }
5117 
5118   return Cost;
5119 }
5120 
5121 /// Gets Address Access SCEV after verifying that the access pattern
5122 /// is loop invariant except the induction variable dependence.
5123 ///
5124 /// This SCEV can be sent to the Target in order to estimate the address
5125 /// calculation cost.
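///
/// For example (illustrative IR, not from this file), a pointer such as
///   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
/// with loop-invariant %base and induction %iv qualifies, whereas a GEP with
/// another loop-varying, non-induction index does not.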
5126 static const SCEV *getAddressAccessSCEV(
5127               Value *Ptr,
5128               LoopVectorizationLegality *Legal,
5129               PredicatedScalarEvolution &PSE,
5130               const Loop *TheLoop) {
5131 
5132   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5133   if (!Gep)
5134     return nullptr;
5135 
5136   // We are looking for a gep with all loop invariant indices except for one
5137   // which should be an induction variable.
5138   auto *SE = PSE.getSE();
5139   unsigned NumOperands = Gep->getNumOperands();
5140   for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5141     Value *Opd = Gep->getOperand(Idx);
5142     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5143         !Legal->isInductionVariable(Opd))
5144       return nullptr;
5145   }
5146 
5147   // Now we know we have a GEP like (ptr, %inv, %ind, %inv). Return the Ptr SCEV.
5148   return PSE.getSCEV(Ptr);
5149 }
5150 
5151 InstructionCost
5152 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5153                                                         ElementCount VF) {
5154   assert(VF.isVector() &&
5155          "Scalarization cost of instruction implies vectorization.");
5156   if (VF.isScalable())
5157     return InstructionCost::getInvalid();
5158 
5159   Type *ValTy = getLoadStoreType(I);
5160   auto *SE = PSE.getSE();
5161 
5162   unsigned AS = getLoadStoreAddressSpace(I);
5163   Value *Ptr = getLoadStorePointerOperand(I);
5164   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5165   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5166   //       that it is being called from this specific place.
5167 
5168   // Figure out whether the access is strided and get the stride value
5169   // if it's known at compile time.
5170   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5171 
5172   // Get the cost of the scalar memory instruction and address computation.
5173   InstructionCost Cost =
5174       VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5175 
5176   // Don't pass *I here, since it is scalar but will actually be part of a
5177   // vectorized loop where the user of it is a vectorized instruction.
5178   const Align Alignment = getLoadStoreAlignment(I);
5179   Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
5180                                                    ValTy->getScalarType(),
5181                                                    Alignment, AS, CostKind);
5182 
5183   // Get the overhead of the extractelement and insertelement instructions
5184   // we might create due to scalarization.
5185   Cost += getScalarizationOverhead(I, VF);
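  // Illustrative example (hypothetical costs): at VF = 4, four scalar memory
  // ops of cost 1 plus four address computations of cost 1 give 8, and the
  // insert/extract overhead above is added before any predication scaling.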
5186 
5187   // If we have a predicated load/store, it will need extra i1 extracts and
5188   // conditional branches, but may not be executed for each vector lane. Scale
5189   // the cost by the probability of executing the predicated block.
5190   if (isPredicatedInst(I)) {
5191     Cost /= getPredBlockCostDivisor(CostKind);
5192 
5193     // Add the cost of an i1 extract and a branch
5194     auto *VecI1Ty =
5195         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5196     Cost += TTI.getScalarizationOverhead(
5197         VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5198         /*Insert=*/false, /*Extract=*/true, CostKind);
5199     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5200 
5201     if (useEmulatedMaskMemRefHack(I, VF))
5202       // Artificially setting to a high enough value to practically disable
5203       // vectorization with such operations.
5204       Cost = 3000000;
5205   }
5206 
5207   return Cost;
5208 }
5209 
5210 InstructionCost
5211 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5212                                                     ElementCount VF) {
5213   Type *ValTy = getLoadStoreType(I);
5214   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5215   Value *Ptr = getLoadStorePointerOperand(I);
5216   unsigned AS = getLoadStoreAddressSpace(I);
5217   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5218 
5219   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5220          "Stride should be 1 or -1 for consecutive memory access");
5221   const Align Alignment = getLoadStoreAlignment(I);
5222   InstructionCost Cost = 0;
5223   if (Legal->isMaskRequired(I)) {
5224     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5225                                       CostKind);
5226   } else {
5227     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5228     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5229                                 CostKind, OpInfo, I);
5230   }
5231 
5232   bool Reverse = ConsecutiveStride < 0;
5233   if (Reverse)
5234     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5235                                VectorTy, {}, CostKind, 0);
5236   return Cost;
5237 }
5238 
5239 InstructionCost
5240 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5241                                                 ElementCount VF) {
5242   assert(Legal->isUniformMemOp(*I, VF));
5243 
5244   Type *ValTy = getLoadStoreType(I);
5245   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5246   const Align Alignment = getLoadStoreAlignment(I);
5247   unsigned AS = getLoadStoreAddressSpace(I);
5248   if (isa<LoadInst>(I)) {
5249     return TTI.getAddressComputationCost(ValTy) +
5250            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5251                                CostKind) +
5252            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy,
5253                               VectorTy, {}, CostKind);
5254   }
5255   StoreInst *SI = cast<StoreInst>(I);
5256 
5257   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5258   // TODO: We have existing tests that request the cost of extracting element
5259   // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5260   // the actual generated code, which involves extracting the last element of
5261   // a scalable vector where the lane to extract is unknown at compile time.
5262   return TTI.getAddressComputationCost(ValTy) +
5263          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5264                              CostKind) +
5265          (IsLoopInvariantStoreValue
5266               ? 0
5267               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5268                                        CostKind, VF.getKnownMinValue() - 1));
5269 }
5270 
5271 InstructionCost
5272 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5273                                                  ElementCount VF) {
5274   Type *ValTy = getLoadStoreType(I);
5275   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5276   const Align Alignment = getLoadStoreAlignment(I);
5277   const Value *Ptr = getLoadStorePointerOperand(I);
5278 
5279   return TTI.getAddressComputationCost(VectorTy) +
5280          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5281                                     Legal->isMaskRequired(I), Alignment,
5282                                     CostKind, I);
5283 }
5284 
5285 InstructionCost
5286 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5287                                                    ElementCount VF) {
5288   const auto *Group = getInterleavedAccessGroup(I);
5289   assert(Group && "Fail to get an interleaved access group.");
5290 
5291   Instruction *InsertPos = Group->getInsertPos();
5292   Type *ValTy = getLoadStoreType(InsertPos);
5293   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5294   unsigned AS = getLoadStoreAddressSpace(InsertPos);
5295 
5296   unsigned InterleaveFactor = Group->getFactor();
5297   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5298 
5299   // Holds the indices of existing members in the interleaved group.
5300   SmallVector<unsigned, 4> Indices;
5301   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5302     if (Group->getMember(IF))
5303       Indices.push_back(IF);
5304 
5305   // Calculate the cost of the whole interleaved group.
5306   bool UseMaskForGaps =
5307       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5308       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5309   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5310       InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5311       Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5312       UseMaskForGaps);
5313 
5314   if (Group->isReverse()) {
5315     // TODO: Add support for reversed masked interleaved access.
5316     assert(!Legal->isMaskRequired(I) &&
5317            "Reverse masked interleaved access not supported.");
5318     Cost += Group->getNumMembers() *
5319             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5320                                VectorTy, {}, CostKind, 0);
5321   }
5322   return Cost;
5323 }
5324 
5325 std::optional<InstructionCost>
5326 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
5327                                                     ElementCount VF,
5328                                                     Type *Ty) const {
5329   using namespace llvm::PatternMatch;
5330   // Early exit for no inloop reductions
5331   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5332     return std::nullopt;
5333   auto *VectorTy = cast<VectorType>(Ty);
5334 
5335   // We are looking for one of the following patterns, finding the minimal acceptable cost:
5336   //  reduce(mul(ext(A), ext(B))) or
5337   //  reduce(mul(A, B)) or
5338   //  reduce(ext(A)) or
5339   //  reduce(A).
5340   // The basic idea is that we walk down the tree to do that, finding the root
5341   // reduction instruction in InLoopReductionImmediateChains. From there we find
5342   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5343   // of the components. If the reduction cost is lower, then we return it for
5344   // the reduction instruction and 0 for the other instructions in the pattern.
5345   // If it is not, we return an invalid cost specifying the original cost method
5346   // should be used.
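  // For example (illustrative IR, hypothetical values):
  //   %a = sext <4 x i8> %x to <4 x i32>
  //   %b = sext <4 x i8> %y to <4 x i32>
  //   %m = mul nsw <4 x i32> %a, %b
  //   %r = add <4 x i32> %m, %phi   ; in-loop reduction chain
  // Here the cost of a single multiply-accumulate reduction is compared
  // against the summed costs of the extends, the multiply and the plain add
  // reduction.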
5347   Instruction *RetI = I;
5348   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5349     if (!RetI->hasOneUser())
5350       return std::nullopt;
5351     RetI = RetI->user_back();
5352   }
5353 
5354   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5355       RetI->user_back()->getOpcode() == Instruction::Add) {
5356     RetI = RetI->user_back();
5357   }
5358 
5359   // Test if the found instruction is a reduction, and if not return an invalid
5360   // cost specifying the parent to use the original cost modelling.
5361   Instruction *LastChain = InLoopReductionImmediateChains.lookup(RetI);
5362   if (!LastChain)
5363     return std::nullopt;
5364 
5365   // Find the reduction this chain is a part of and calculate the basic cost of
5366   // the reduction on its own.
5367   Instruction *ReductionPhi = LastChain;
5368   while (!isa<PHINode>(ReductionPhi))
5369     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5370 
5371   const RecurrenceDescriptor &RdxDesc =
5372       Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
5373 
5374   InstructionCost BaseCost;
5375   RecurKind RK = RdxDesc.getRecurrenceKind();
5376   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5377     Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5378     BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5379                                           RdxDesc.getFastMathFlags(), CostKind);
5380   } else {
5381     BaseCost = TTI.getArithmeticReductionCost(
5382         RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5383   }
5384 
5385   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5386   // normal fmul instruction to the cost of the fadd reduction.
5387   if (RK == RecurKind::FMulAdd)
5388     BaseCost +=
5389         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5390 
5391   // If we're using ordered reductions then we can just return the base cost
5392   // here, since getArithmeticReductionCost calculates the full ordered
5393   // reduction cost when FP reassociation is not allowed.
5394   if (useOrderedReductions(RdxDesc))
5395     return BaseCost;
5396 
5397   // Get the operand that was not the reduction chain and match it to one of the
5398   // patterns, returning the better cost if it is found.
5399   Instruction *RedOp = RetI->getOperand(1) == LastChain
5400                            ? dyn_cast<Instruction>(RetI->getOperand(0))
5401                            : dyn_cast<Instruction>(RetI->getOperand(1));
5402 
5403   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5404 
5405   Instruction *Op0, *Op1;
5406   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5407       match(RedOp,
5408             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5409       match(Op0, m_ZExtOrSExt(m_Value())) &&
5410       Op0->getOpcode() == Op1->getOpcode() &&
5411       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5412       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5413       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5414 
5415     // Matched reduce.add(ext(mul(ext(A), ext(B))))
5416     // Note that the extend opcodes need to all match, or if A==B they will have
5417     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5418     // which is equally fine.
5419     bool IsUnsigned = isa<ZExtInst>(Op0);
5420     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5421     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5422 
5423     InstructionCost ExtCost =
5424         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5425                              TTI::CastContextHint::None, CostKind, Op0);
5426     InstructionCost MulCost =
5427         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5428     InstructionCost Ext2Cost =
5429         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5430                              TTI::CastContextHint::None, CostKind, RedOp);
5431 
5432     InstructionCost RedCost = TTI.getMulAccReductionCost(
5433         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5434 
5435     if (RedCost.isValid() &&
5436         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5437       return I == RetI ? RedCost : 0;
5438   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5439              !TheLoop->isLoopInvariant(RedOp)) {
5440     // Matched reduce(ext(A))
5441     bool IsUnsigned = isa<ZExtInst>(RedOp);
5442     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5443     InstructionCost RedCost = TTI.getExtendedReductionCost(
5444         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5445         RdxDesc.getFastMathFlags(), CostKind);
5446 
5447     InstructionCost ExtCost =
5448         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5449                              TTI::CastContextHint::None, CostKind, RedOp);
5450     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5451       return I == RetI ? RedCost : 0;
5452   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5453              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5454     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5455         Op0->getOpcode() == Op1->getOpcode() &&
5456         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5457       bool IsUnsigned = isa<ZExtInst>(Op0);
5458       Type *Op0Ty = Op0->getOperand(0)->getType();
5459       Type *Op1Ty = Op1->getOperand(0)->getType();
5460       Type *LargestOpTy =
5461           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5462                                                                     : Op0Ty;
5463       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5464 
5465       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5466       // different sizes. We take the largest type as the ext to reduce, and add
5467       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5468       InstructionCost ExtCost0 = TTI.getCastInstrCost(
5469           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5470           TTI::CastContextHint::None, CostKind, Op0);
5471       InstructionCost ExtCost1 = TTI.getCastInstrCost(
5472           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5473           TTI::CastContextHint::None, CostKind, Op1);
5474       InstructionCost MulCost =
5475           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5476 
5477       InstructionCost RedCost = TTI.getMulAccReductionCost(
5478           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5479       InstructionCost ExtraExtCost = 0;
5480       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5481         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5482         ExtraExtCost = TTI.getCastInstrCost(
5483             ExtraExtOp->getOpcode(), ExtType,
5484             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5485             TTI::CastContextHint::None, CostKind, ExtraExtOp);
5486       }
5487 
5488       if (RedCost.isValid() &&
5489           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5490         return I == RetI ? RedCost : 0;
5491     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5492       // Matched reduce.add(mul())
5493       InstructionCost MulCost =
5494           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5495 
5496       InstructionCost RedCost = TTI.getMulAccReductionCost(
5497           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5498 
5499       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5500         return I == RetI ? RedCost : 0;
5501     }
5502   }
5503 
5504   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5505 }
5506 
5507 InstructionCost
5508 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5509                                                      ElementCount VF) {
5510   // Calculate scalar cost only. Vectorization cost should be ready at this
5511   // moment.
5512   if (VF.isScalar()) {
5513     Type *ValTy = getLoadStoreType(I);
5514     const Align Alignment = getLoadStoreAlignment(I);
5515     unsigned AS = getLoadStoreAddressSpace(I);
5516 
5517     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5518     return TTI.getAddressComputationCost(ValTy) +
5519            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5520                                OpInfo, I);
5521   }
5522   return getWideningCost(I, VF);
5523 }
5524 
5525 InstructionCost
5526 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5527                                                      ElementCount VF) const {
5528 
5529   // There is no mechanism yet to create a scalable scalarization loop,
5530   // so this is currently Invalid.
5531   if (VF.isScalable())
5532     return InstructionCost::getInvalid();
5533 
5534   if (VF.isScalar())
5535     return 0;
5536 
5537   InstructionCost Cost = 0;
5538   Type *RetTy = toVectorizedTy(I->getType(), VF);
5539   if (!RetTy->isVoidTy() &&
5540       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5541 
5542     for (Type *VectorTy : getContainedTypes(RetTy)) {
5543       Cost += TTI.getScalarizationOverhead(
5544           cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5545           /*Insert=*/true,
5546           /*Extract=*/false, CostKind);
5547     }
5548   }
5549 
5550   // Some targets keep addresses scalar.
5551   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5552     return Cost;
5553 
5554   // Some targets support efficient element stores.
5555   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5556     return Cost;
5557 
5558   // Collect operands to consider.
5559   CallInst *CI = dyn_cast<CallInst>(I);
5560   Instruction::op_range Ops = CI ? CI->args() : I->operands();
5561 
5562   // Skip operands that do not require extraction/scalarization and do not incur
5563   // any overhead.
5564   SmallVector<Type *> Tys;
5565   for (auto *V : filterExtractingOperands(Ops, VF))
5566     Tys.push_back(maybeVectorizeType(V->getType(), VF));
5567   return Cost + TTI.getOperandsScalarizationOverhead(
5568                     filterExtractingOperands(Ops, VF), Tys, CostKind);
5569 }
5570 
5571 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
5572   if (VF.isScalar())
5573     return;
5574   NumPredStores = 0;
5575   for (BasicBlock *BB : TheLoop->blocks()) {
5576     // For each instruction in the old loop.
5577     for (Instruction &I : *BB) {
5578       Value *Ptr = getLoadStorePointerOperand(&I);
5579       if (!Ptr)
5580         continue;
5581 
5582       // TODO: We should generate better code and update the cost model for
5583       // predicated uniform stores. Today they are treated as any other
5584       // predicated store (see added test cases in
5585       // invariant-store-vectorization.ll).
5586       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
5587         NumPredStores++;
5588 
5589       if (Legal->isUniformMemOp(I, VF)) {
5590         auto IsLegalToScalarize = [&]() {
5591           if (!VF.isScalable())
5592             // Scalarization of fixed length vectors "just works".
5593             return true;
5594 
5595           // We have dedicated lowering for unpredicated uniform loads and
5596           // stores.  Note that even with tail folding we know that at least
5597           // one lane is active (i.e. generalized predication is not possible
5598           // here), and the logic below depends on this fact.
5599           if (!foldTailByMasking())
5600             return true;
5601 
5602           // For scalable vectors, a uniform memop load is always
5603           // uniform-by-parts  and we know how to scalarize that.
5604           if (isa<LoadInst>(I))
5605             return true;
5606 
5607           // A uniform store isn't necessarily uniform-by-parts,
5608           // so we can't assume scalarization.
5609           auto &SI = cast<StoreInst>(I);
5610           return TheLoop->isLoopInvariant(SI.getValueOperand());
5611         };
5612 
5613         const InstructionCost GatherScatterCost =
5614           isLegalGatherOrScatter(&I, VF) ?
5615           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
5616 
5617         // Load: Scalar load + broadcast
5618         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5619         // FIXME: This cost is a significant under-estimate for tail folded
5620         // memory ops.
5621         const InstructionCost ScalarizationCost =
5622             IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
5623                                  : InstructionCost::getInvalid();
5624 
5625         // Choose the better solution for the current VF. Note that invalid
5626         // costs compare as maximally large. If both are invalid, the result
5627         // stays invalid, which signals a failure and a vectorization abort.
5628         if (GatherScatterCost < ScalarizationCost)
5629           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
5630         else
5631           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
5632         continue;
5633       }
5634 
5635       // We assume that widening is the best solution when possible.
5636       if (memoryInstructionCanBeWidened(&I, VF)) {
5637         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
5638         int ConsecutiveStride = Legal->isConsecutivePtr(
5639             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
5640         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5641                "Expected consecutive stride.");
5642         InstWidening Decision =
5643             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5644         setWideningDecision(&I, VF, Decision, Cost);
5645         continue;
5646       }
5647 
5648       // Choose between Interleaving, Gather/Scatter or Scalarization.
5649       InstructionCost InterleaveCost = InstructionCost::getInvalid();
5650       unsigned NumAccesses = 1;
5651       if (isAccessInterleaved(&I)) {
5652         const auto *Group = getInterleavedAccessGroup(&I);
5653         assert(Group && "Fail to get an interleaved access group.");
5654 
5655         // Make one decision for the whole group.
5656         if (getWideningDecision(&I, VF) != CM_Unknown)
5657           continue;
5658 
5659         NumAccesses = Group->getNumMembers();
5660         if (interleavedAccessCanBeWidened(&I, VF))
5661           InterleaveCost = getInterleaveGroupCost(&I, VF);
5662       }
5663 
5664       InstructionCost GatherScatterCost =
5665           isLegalGatherOrScatter(&I, VF)
5666               ? getGatherScatterCost(&I, VF) * NumAccesses
5667               : InstructionCost::getInvalid();
5668 
5669       InstructionCost ScalarizationCost =
5670           getMemInstScalarizationCost(&I, VF) * NumAccesses;
5671 
5672       // Choose the better solution for the current VF, record the decision,
5673       // and use it during vectorization.
5674       InstructionCost Cost;
5675       InstWidening Decision;
5676       if (InterleaveCost <= GatherScatterCost &&
5677           InterleaveCost < ScalarizationCost) {
5678         Decision = CM_Interleave;
5679         Cost = InterleaveCost;
5680       } else if (GatherScatterCost < ScalarizationCost) {
5681         Decision = CM_GatherScatter;
5682         Cost = GatherScatterCost;
5683       } else {
5684         Decision = CM_Scalarize;
5685         Cost = ScalarizationCost;
5686       }
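      // Illustrative example (hypothetical costs): with InterleaveCost = 6,
      // GatherScatterCost = 10 and ScalarizationCost = 14, interleaving is
      // chosen; invalid costs compare as larger than any valid cost, so an
      // illegal option simply drops out of the comparison.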
5687       // If the instruction belongs to an interleave group, the whole group
5688       // receives the same decision. The cost applies to the whole group, but
5689       // it will actually be assigned to a single instruction.
5690       if (const auto *Group = getInterleavedAccessGroup(&I))
5691         setWideningDecision(Group, VF, Decision, Cost);
5692       else
5693         setWideningDecision(&I, VF, Decision, Cost);
5694     }
5695   }
5696 
5697   // Make sure that any load of an address and any other address computation
5698   // remains scalar unless there is gather/scatter support. This avoids
5699   // inevitable extracts into address registers, and also has the benefit of
5700   // activating LSR more, since that pass can't optimize vectorized
5701   // addresses.
5702   if (TTI.prefersVectorizedAddressing())
5703     return;
5704 
5705   // Start with all scalar pointer uses.
5706   SmallPtrSet<Instruction *, 8> AddrDefs;
5707   for (BasicBlock *BB : TheLoop->blocks())
5708     for (Instruction &I : *BB) {
5709       Instruction *PtrDef =
5710         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5711       if (PtrDef && TheLoop->contains(PtrDef) &&
5712           getWideningDecision(&I, VF) != CM_GatherScatter)
5713         AddrDefs.insert(PtrDef);
5714     }
5715 
5716   // Add all instructions used to generate the addresses.
5717   SmallVector<Instruction *, 4> Worklist;
5718   append_range(Worklist, AddrDefs);
5719   while (!Worklist.empty()) {
5720     Instruction *I = Worklist.pop_back_val();
5721     for (auto &Op : I->operands())
5722       if (auto *InstOp = dyn_cast<Instruction>(Op))
5723         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5724             AddrDefs.insert(InstOp).second)
5725           Worklist.push_back(InstOp);
5726   }
5727 
5728   for (auto *I : AddrDefs) {
5729     if (isa<LoadInst>(I)) {
5730       // Setting the desired widening decision should ideally be handled by
5731       // the cost functions, but since this involves finding out whether the
5732       // loaded register is involved in an address computation, it is instead
5733       // changed here when we know this is the case.
5734       InstWidening Decision = getWideningDecision(I, VF);
5735       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5736         // Scalarize a widened load of address.
5737         setWideningDecision(
5738             I, VF, CM_Scalarize,
5739             (VF.getKnownMinValue() *
5740              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
5741       else if (const auto *Group = getInterleavedAccessGroup(I)) {
5742         // Scalarize an interleave group of address loads.
5743         for (unsigned I = 0; I < Group->getFactor(); ++I) {
5744           if (Instruction *Member = Group->getMember(I))
5745             setWideningDecision(
5746                 Member, VF, CM_Scalarize,
5747                 (VF.getKnownMinValue() *
5748                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
5749         }
5750       }
5751     } else {
5752       // Cannot scalarize fixed-order recurrence phis at the moment.
5753       if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5754         continue;
5755 
5756       // Make sure I gets scalarized and is given a cost estimate without
5757       // scalarization overhead.
5758       ForcedScalars[VF].insert(I);
5759     }
5760   }
5761 }
5762 
5763 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
5764   assert(!VF.isScalar() &&
5765          "Trying to set a vectorization decision for a scalar VF");
5766 
5767   auto ForcedScalar = ForcedScalars.find(VF);
5768   for (BasicBlock *BB : TheLoop->blocks()) {
5769     // For each instruction in the old loop.
5770     for (Instruction &I : *BB) {
5771       CallInst *CI = dyn_cast<CallInst>(&I);
5772 
5773       if (!CI)
5774         continue;
5775 
5776       InstructionCost ScalarCost = InstructionCost::getInvalid();
5777       InstructionCost VectorCost = InstructionCost::getInvalid();
5778       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
5779       Function *ScalarFunc = CI->getCalledFunction();
5780       Type *ScalarRetTy = CI->getType();
5781       SmallVector<Type *, 4> Tys, ScalarTys;
5782       for (auto &ArgOp : CI->args())
5783         ScalarTys.push_back(ArgOp->getType());
5784 
5785       // Estimate cost of scalarized vector call. The source operands are
5786       // assumed to be vectors, so we need to extract individual elements from
5787       // them, execute VF scalar calls, and then gather the results into the
5788       // vector return value.
      InstructionCost ScalarCallCost =
          TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);

      // Compute costs of unpacking argument values for the scalar calls and
      // packing the return values to a vector.
      InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
      // Honor ForcedScalars and UniformAfterVectorization decisions.
      // TODO: For calls, it might still be more profitable to widen. Use
      // VPlan-based cost model to compare different options.
      if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
                             ForcedScalar->second.contains(CI)) ||
                            isUniformAfterVectorization(CI, VF))) {
        setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
                                Intrinsic::not_intrinsic, std::nullopt,
                                ScalarCost);
        continue;
      }

      bool MaskRequired = Legal->isMaskRequired(CI);
      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(toVectorizedTy(ScalarTy, VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
        if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
          setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
                                  getVectorIntrinsicIDForCall(CI, TLI),
                                  std::nullopt, *RedCost);
          continue;
        }

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
      for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
          case VFParamKind::Vector:
            break;
          case VFParamKind::OMP_Uniform: {
            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
                                              TheLoop))
              ParamsOk = false;
            break;
          }
          case VFParamKind::OMP_Linear: {
            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
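            // For example (illustrative): a variant declared with
            // 'declare simd linear(p:4)' is only usable if the SCEV for the
            // argument is an add-recurrence in this loop with constant step 4.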
            ScalarEvolution *SE = PSE.getSE();
            const auto *SAR =
                dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));

            if (!SAR || SAR->getLoop() != TheLoop) {
              ParamsOk = false;
              break;
            }

            const SCEVConstant *Step =
                dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));

            if (!Step ||
                Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
              ParamsOk = false;

            break;
          }
          case VFParamKind::GlobalPredicate:
            break;
          default:
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Info.VectorName);
        FuncInfo = Info;
        break;
      }

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
                              FuncInfo.getParamIndexForOptionalMask(), Cost);
    }
  }
}

bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
  if (!Legal->isInvariant(Op))
    return false;
  // Consider Op invariant only if neither it nor any of its operands is a
  // predicated instruction in the loop; a predicated instruction is not
  // trivially hoistable.
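  // For example (illustrative): a load from a loop-invariant address that
  // only executes under a condition inside the loop cannot simply be hoisted
  // out, so it is not treated as invariant here.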
  auto *OpI = dyn_cast<Instruction>(Op);
  return !OpI || !TheLoop->contains(OpI) ||
         (!isPredicatedInst(OpI) &&
          (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
          all_of(OpI->operands(),
                 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return InstsToScalarize[VF][I];

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(I))
      return getInstructionCost(I, ElementCount::getFixed(1)) *
             VF.getKnownMinValue();
  }

  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto *SE = PSE.getSE();

  Type *VectorTy;
  if (isScalarAfterVectorization(I, VF)) {
    [[maybe_unused]] auto HasSingleCopyAfterVectorization =
        [this](Instruction *I, ElementCount VF) -> bool {
      if (VF.isScalar())
        return true;

      auto Scalarized = InstsToScalarize.find(VF);
      assert(Scalarized != InstsToScalarize.end() &&
             "VF not yet analyzed for scalarization profitability");
      return !Scalarized->second.count(I) &&
             llvm::all_of(I->users(), [&](User *U) {
               auto *UI = cast<Instruction>(U);
               return !Scalarized->second.count(UI);
             });
    };

    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           HasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = toVectorizedTy(RetTy, VF);

  if (VF.isVector() && VectorTy->isVectorTy() &&
      !TTI.getNumberOfParts(VectorTy))
    return InstructionCost::getInvalid();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    // Note that the conditional branch from the loop latch will be replaced
    // by a single branch controlling the loop, so there is no extra overhead
    // from scalarization.
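    // For example (illustrative): with VF = 4, a scalarized predicated block
    // is charged roughly 4 extracts from the <4 x i1> compare plus 4 scalar
    // branches, which is what the computation below models.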
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
        BI->getParent() != TheLoop->getLoopLatch())
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize a scalable vector with predicated
      // instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *VecI1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    }

    if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::Switch: {
    if (VF.isScalar())
      return TTI.getCFInstrCost(Instruction::Switch, CostKind);
    auto *Switch = cast<SwitchInst>(I);
    return Switch->getNumCases() *
           TTI.getCmpSelInstrCost(
               Instruction::ICmp,
               toVectorTy(Switch->getCondition()->getType(), VF),
               toVectorTy(Type::getInt1Ty(I->getContext()), VF),
               CmpInst::ICMP_EQ, CostKind);
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
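    // For example (illustrative): with VF = 4 the splice below uses shuffle
    // mask <3, 4, 5, 6>, i.e. the last lane of the previous iteration's
    // vector followed by the first three lanes of the current one.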
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
                                cast<VectorType>(VectorTy),
                                cast<VectorType>(VectorTy), Mask, CostKind,
                                VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
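    // For example (illustrative): a phi merging three if-converted paths
    // (N = 3) becomes two vector selects chained on the incoming block masks.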
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
      Type *ResultTy = Phi->getType();

      // All instructions in an Any-of reduction chain are narrowed to bool.
      // Check if that is the case for this phi node.
      auto *HeaderUser = cast_if_present<PHINode>(
          find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
            auto *Phi = dyn_cast<PHINode>(U);
            if (Phi && Phi->getParent() == TheLoop->getHeader())
              return Phi;
            return nullptr;
          }));
      if (HeaderUser) {
        auto &ReductionVars = Legal->getReductionVars();
        auto Iter = ReductionVars.find(HeaderUser);
        if (Iter != ReductionVars.end() &&
            RecurrenceDescriptor::isAnyOfRecurrenceKind(
                Iter->second.getRecurrenceKind()))
          ResultTy = Type::getInt1Ty(Phi->getContext());
      }
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, toVectorTy(ResultTy, VF),
                 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
                 CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }

    // When tail folding with EVL, if the phi is part of an out-of-loop
    // reduction then it will be transformed into a wide vp_merge.
    if (VF.isVector() && foldTailWithEVL() &&
        Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
      IntrinsicCostAttributes ICA(
          Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
          {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
      return TTI.getIntrinsicInstrCost(ICA, CostKind);
    }

    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    if (VF.isVector() && isPredicatedInst(I)) {
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Sub: {
    auto Info = Legal->getHistogramInfo(I);
    if (Info && VF.isVector()) {
      const HistogramInfo *HGram = Info.value();
      // Assume that a non-constant update value (or a constant != 1) requires
      // a multiply, and add that into the cost.
      InstructionCost MulCost = TTI::TCC_Free;
      ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
      if (!RHS || RHS->getZExtValue() != 1)
        MulCost =
            TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      // Find the cost of the histogram operation itself.
      Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
      Type *ScalarTy = I->getType();
      Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
      IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                                  Type::getVoidTy(I->getContext()),
                                  {PtrTy, ScalarTy, MaskTy});

      // Add the costs together with the add/sub operation.
      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
             TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
    }
    [[fallthrough]];
  }
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away.  We can generalize this for all operations using the notion
    // of neutral elements.  (TODO)
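    // For instance (illustrative): under a runtime check proving
    // %stride == 1, '%ofs = mul i64 %iv, %stride' simplifies to '%iv', so
    // the multiply is costed as free.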
    if (I->getOpcode() == Instruction::Mul &&
        ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
          PSE.getSCEV(I->getOperand(0))->isOne()) ||
         (TheLoop->isLoopInvariant(I->getOperand(1)) &&
          PSE.getSCEV(I->getOperand(1))->isOne())))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    Value *Op2 = I->getOperand(1);
    if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
        PSE.getSE()->isSCEVable(Op2->getType()) &&
        isa<SCEVConstant>(PSE.getSCEV(Op2))) {
      Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
    }
    auto Op2Info = TTI.getOperandInfo(Op2);
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        shouldConsiderInvariant(Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        Op2Info, Operands, I, TLI);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
                        match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
              Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
          CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
                                  CostKind, {TTI::OK_AnyValue, TTI::OP_None},
                                  {TTI::OK_AnyValue, TTI::OP_None}, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();

    if (canTruncateToMinimalBitwidth(I, VF)) {
      [[maybe_unused]] Instruction *Op0AsInstruction =
          dyn_cast<Instruction>(I->getOperand(0));
      assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
              MinBWs[I] == MinBWs[Op0AsInstruction]) &&
             "if both the operand and the compare are marked for "
             "truncation, they must have the same bitwidth");
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
    }

    VectorTy = toVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(
        I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
        cast<CmpInst>(I)->getPredicate(), CostKind,
        {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(1);
    }
    VectorTy = toVectorTy(getLoadStoreType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return isPredicatedInst(I) ? TTI::CastContextHint::Masked
                                   : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal("Instr has invalid widening decision");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      SrcScalarTy =
          IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;

    if (canTruncateToMinimalBitwidth(I, VF)) {
      // If the result type is <= the source type, there will be no extend
      // after truncating the users to the minimal required bitwidth.
      if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
          (I->getOpcode() == Instruction::ZExt ||
           I->getOpcode() == Instruction::SExt))
        return 0;
    }

    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(cast<CallInst>(I), VF);
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(I, CostKind);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    [[fallthrough]];
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  } // end of switch.
}

void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  SmallVector<Value *, 4> DeadInterleavePointerOps;
  SmallVector<Value *, 4> DeadOps;

  // If a scalar epilogue is required, users outside the loop won't use
  // live-outs from the vector loop but from the scalar epilogue. Ignore them if
  // that is the case.
  bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
  auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
    return RequiresScalarEpilogue &&
           !TheLoop->contains(cast<Instruction>(U)->getParent());
  };

  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
  for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
    for (Instruction &I : reverse(*BB)) {
      // Find all stores to invariant variables. Since they are going to sink
      // outside the loop, we do not need to calculate a cost for them.
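      // For example (illustrative): a reduction whose running value is stored
      // to an invariant address every iteration only needs the final store
      // after the loop, so the in-loop stores are ignored here.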
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(&I)) &&
          Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
        ValuesToIgnore.insert(&I);
        DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
            SI->getValueOperand());
      }

      if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
        continue;

      // Add instructions that would be trivially dead and are only used by
      // values already ignored to DeadOps, to seed the worklist.
      if (wouldInstructionBeTriviallyDead(&I, TLI) &&
          all_of(I.users(), [this, IsLiveOutDead](User *U) {
            return VecValuesToIgnore.contains(U) ||
                   ValuesToIgnore.contains(U) || IsLiveOutDead(U);
          }))
        DeadOps.push_back(&I);

      // For interleave groups, we only create a pointer for the start of the
      // interleave group. Queue up addresses of group members except the insert
      // position for further processing.
      if (isAccessInterleaved(&I)) {
        auto *Group = getInterleavedAccessGroup(&I);
        if (Group->getInsertPos() == &I)
          continue;
        Value *PointerOp = getLoadStorePointerOperand(&I);
        DeadInterleavePointerOps.push_back(PointerOp);
      }

      // Queue branches for analysis. They are dead if their successors only
      // contain dead instructions.
      if (auto *Br = dyn_cast<BranchInst>(&I)) {
        if (Br->isConditional())
          DeadOps.push_back(&I);
      }
    }

  // Mark ops feeding interleave group members as free, if they are only used
  // by other dead computations.
  for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
    if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
          Instruction *UI = cast<Instruction>(U);
          return !VecValuesToIgnore.contains(U) &&
                 (!isAccessInterleaved(UI) ||
                  getInterleavedAccessGroup(UI)->getInsertPos() == UI);
        }))
      continue;
    VecValuesToIgnore.insert(Op);
    DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
  }

  for (const auto &[_, Ops] : DeadInvariantStoreOps)
    llvm::append_range(DeadOps, drop_end(Ops));

  // Mark ops that would be trivially dead and are only used by ignored
  // instructions as free.
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if the block contains only dead instructions. Such blocks will
  // be removed by VPlan-to-VPlan transforms and won't be considered by the
  // VPlan-based cost model, so skip them in the legacy cost-model as well.
  auto IsEmptyBlock = [this](BasicBlock *BB) {
    return all_of(*BB, [this](Instruction &I) {
      return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
             (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
    });
  };
  for (unsigned I = 0; I != DeadOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(DeadOps[I]);

    // Check if the branch should be considered dead.
    if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
      BasicBlock *ThenBB = Br->getSuccessor(0);
      BasicBlock *ElseBB = Br->getSuccessor(1);
      // Don't consider branches leaving the loop for simplification.
      if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
        continue;
      bool ThenEmpty = IsEmptyBlock(ThenBB);
      bool ElseEmpty = IsEmptyBlock(ElseBB);
      if ((ThenEmpty && ElseEmpty) ||
          (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
           ElseBB->phis().empty()) ||
          (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
           ThenBB->phis().empty())) {
        VecValuesToIgnore.insert(Br);
        DeadOps.push_back(Br->getCondition());
      }
      continue;
    }

    // Skip any op that shouldn't be considered dead.
    if (!Op || !TheLoop->contains(Op) ||
        (isa<PHINode>(Op) && Op->getParent() == Header) ||
        !wouldInstructionBeTriviallyDead(Op, TLI) ||
        any_of(Op->users(), [this, IsLiveOutDead](User *U) {
          return !VecValuesToIgnore.contains(U) &&
                 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
        }))
      continue;

    // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
    // which applies for both scalar and vector versions. Otherwise it is only
    // dead in vector versions, so only add it to VecValuesToIgnore.
    if (all_of(Op->users(),
               [this](User *U) { return ValuesToIgnore.contains(U); }))
      ValuesToIgnore.insert(Op);

    VecValuesToIgnore.insert(Op);
    DeadOps.append(Op->op_begin(), Op->op_end());
  }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert_range(Casts);
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert_range(Casts);
  }
}

void LoopVectorizationCostModel::collectInLoopReductions() {
  // Avoid duplicating work finding in-loop reductions.
  if (!InLoopReductions.empty())
    return;

  for (const auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such.
    RecurKind Kind = RdxDesc.getRecurrenceKind();
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Kind, Phi->getType()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();

    if (InLoop) {
      InLoopReductions.insert(Phi);
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// This function will select a scalable VF if the target supports scalable
// vectors and a fixed one otherwise.
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
                                     LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();

  TargetTransformInfo::RegisterKind RegKind =
      TTI.enableScalableVectorization()
          ? TargetTransformInfo::RGK_ScalableVector
          : TargetTransformInfo::RGK_FixedWidthVector;

  TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
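  // For example (illustrative): 128-bit fixed-width registers and a widest
  // element type of i32 give N = 128 / 32 = 4, i.e. VF = 4; with scalable
  // vectors the result would be vscale x 4 instead.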
  unsigned N = RegSize.getKnownMinValue() / WidestType;
  return ElementCount::get(N, RegSize.isScalable());
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          "Scalable vectorization requested but not supported by the target",
          "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          "ScalableVFUnfeasible", ORE, OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    if (VPlans.empty())
      return VectorizationFactor::Disabled();

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
    return;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  if (CM.foldTailByMasking())
    Legal->prepareToFoldTailByMasking();

  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
      reportVectorizationInfo(
          "UserVF ignored because it may be larger than the maximal safe VF",
          "InvalidUserVF", ORE, OrigLoop);
    } else {
      assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
             "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be more
      // profitable to scalarize.
      CM.collectInLoopReductions();
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(UserVF, UserVF);
        LLVM_DEBUG(printPlans(dbgs()));
        return;
      }
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
    }
  }

  // Collect the Vectorization Factor Candidates.
  SmallVector<ElementCount> VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(VF);
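  // For example (illustrative): MaxFactors.FixedVF = 8 and
  // MaxFactors.ScalableVF = vscale x 4 yield the candidates
  // {1, 2, 4, 8, vscale x 1, vscale x 2, vscale x 4}.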

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectNonVectorizedAndSetWideningDecisions(VF);
  }

  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
}

InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
                                             ElementCount VF) const {
  if (ForceTargetInstructionCost.getNumOccurrences())
    return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
  return CM.getInstructionCost(UI, VF);
}

bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
                                                      ElementCount VF) const {
  return CM.isUniformAfterVectorization(I, VF);
}

bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
  return CM.ValuesToIgnore.contains(UI) ||
         (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
         SkipCostComputation.contains(UI);
}

InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {
  InstructionCost Cost;
  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring up directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
  // any recipes associated with the original induction increment instruction
  // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
  // the cost of induction phis and increments (both that are represented by
  // recipes and those that are not), to avoid distinguishing between them here,
  // and skip all recipes that represent induction phis and increments (the
  // former case) later on, if they exist, to avoid counting them twice.
  // Similarly we pre-compute the cost of any optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
    SmallVector<Instruction *> IVInsts = {IVInc};
    for (unsigned I = 0; I != IVInsts.size(); I++) {
      for (Value *Op : IVInsts[I]->operands()) {
        auto *OpI = dyn_cast<Instruction>(Op);
        if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
          continue;
        IVInsts.push_back(OpI);
      }
    }
    IVInsts.push_back(IV);
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(U);
      if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
        continue;
      IVInsts.push_back(CI);
    }

    // If the vector loop gets executed exactly once with the given VF, ignore
    // the costs of comparison and induction instructions, as they'll get
    // simplified away.
    // TODO: Remove this code after stepping away from the legacy cost model and
    // adding code to simplify VPlans before calculating their costs.
    auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
    if (TC == VF && !CM.foldTailByMasking())
      addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
                                           CostCtx.SkipCostComputation);

    for (Instruction *IVInst : IVInsts) {
      if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(IVInst);
    }
  }

  /// Compute the cost of all exiting conditions of the loop using the legacy
  /// cost model. This is to match the legacy behavior, which adds the cost of
  /// all exit conditions. Note that this over-estimates the cost, as there will
  /// be a single condition to control the vector loop.
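  // For example (illustrative): a loop with an early exit in addition to the
  // latch has two exiting compares; both are costed here even though the
  // vectorized loop ends up with a single controlling condition.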
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
    if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
      ExitInstrs.insert(CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(CondI) ||
        !CostCtx.SkipCostComputation.insert(CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Op);
      if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
          any_of(OpI->users(), [&ExitInstrs, this](User *U) {
            return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
                   !ExitInstrs.contains(cast<Instruction>(U));
          }))
        continue;
      ExitInstrs.insert(OpI);
    }
  }

  // Pre-compute the costs for branches except for the backedge, as the number
  // of replicate regions in a VPlan may not directly match the number of
  // branches, which would lead to different decisions.
  // TODO: Compute cost of branches for each replicate region in the VPlan,
  // which is more accurate than the legacy cost model.
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(BB->getTerminator());
    if (BB == OrigLoop->getLoopLatch())
      continue;
    auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
    Cost += BranchCost;
  }

  // Pre-compute costs for instructions that are forced-scalar or profitable to
  // scalarize. Their costs will be computed separately in the legacy cost
  // model.
  for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
    if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(ForcedScalar);
    InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ForcedCost << " for VF " << VF
             << ": forced scalar " << *ForcedScalar << "\n";
    });
    Cost += ForcedCost;
  }
  for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
    if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Scalarized);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ScalarCost << " for VF " << VF
             << ": profitable to scalarize " << *Scalarized << "\n";
    });
    Cost += ScalarCost;
  }

  return Cost;
}

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                        CM.CostKind);
  InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
  unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}

#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplifications that
/// the legacy cost-model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop,
                                                  ElementCount VF) {
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
      // cost model won't cost them whilst the legacy model will.
      if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
        if (none_of(FOR->users(), [](VPUser *U) {
              auto *VPI = dyn_cast<VPInstruction>(U);
              return VPI && VPI->getOpcode() ==
                                VPInstruction::FirstOrderRecurrenceSplice;
            }))
          return true;
      }
      // The VPlan-based cost model is more accurate for partial reductions,
      // and comparing against the legacy cost isn't desirable.
      if (isa<VPPartialReductionRecipe>(&R))
        return true;

      /// If a VPlan transform folded a recipe to one producing a single-scalar,
      /// but the original instruction wasn't uniform-after-vectorization in the
      /// legacy cost model, the legacy cost overestimates the actual cost.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isSingleScalar() &&
            !CostCtx.isLegacyUniformAfterVectorization(
                RepR->getUnderlyingInstr(), VF))
          return true;
      }
      if (Instruction *UI = GetInstructionForCost(&R)) {
        // If we adjusted the predicate of the recipe, the cost in the legacy
        // cost model may be different.
        if (auto *WidenCmp = dyn_cast<VPWidenRecipe>(&R)) {
          if ((WidenCmp->getOpcode() == Instruction::ICmp ||
               WidenCmp->getOpcode() == Instruction::FCmp) &&
              WidenCmp->getPredicate() != cast<CmpInst>(UI)->getPredicate())
            return true;
        }
        SeenInstrs.insert(UI);
      }
    }
  }

  // Return true if the loop contains any instructions that are not also part of
  // the VPlan or are skipped for VPlan-based cost computations. This indicates
  // that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      // Skip induction phis when checking for simplifications, as they may not
      // be lowered directly to a corresponding PHI recipe.
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
#endif
6982 
computeBestVF()6983 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
6984   if (VPlans.empty())
6985     return VectorizationFactor::Disabled();
6986   // If there is a single VPlan with a single VF, return it directly.
6987   VPlan &FirstPlan = *VPlans[0];
6988   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
6989     return {*FirstPlan.vectorFactors().begin(), 0, 0};
6990 
6991   LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
6992                     << (CM.CostKind == TTI::TCK_RecipThroughput
6993                             ? "Reciprocal Throughput\n"
6994                         : CM.CostKind == TTI::TCK_Latency
6995                             ? "Instruction Latency\n"
6996                         : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
6997                         : CM.CostKind == TTI::TCK_SizeAndLatency
6998                             ? "Code Size and Latency\n"
6999                             : "Unknown\n"));
7000 
7001   ElementCount ScalarVF = ElementCount::getFixed(1);
7002   assert(hasPlanWithVF(ScalarVF) &&
7003          "More than a single plan/VF w/o any plan having scalar VF");
7004 
7005   // TODO: Compute scalar cost using VPlan-based cost model.
7006   InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7007   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7008   VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7009   VectorizationFactor BestFactor = ScalarFactor;
7010 
7011   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7012   if (ForceVectorization) {
7013     // Ignore scalar width, because the user explicitly wants vectorization.
7014     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7015     // evaluation.
7016     BestFactor.Cost = InstructionCost::getMax();
7017   }
7018 
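       // Evaluate the cost of each candidate (plan, VF) pair and keep track of
       // the most profitable one found so far.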
7019   for (auto &P : VPlans) {
7020     ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
7021                                P->vectorFactors().end());
7022 
7023     SmallVector<VPRegisterUsage, 8> RUs;
7024     if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
7025         CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
7026       RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
7027 
7028     for (unsigned I = 0; I < VFs.size(); I++) {
7029       ElementCount VF = VFs[I];
7030       if (VF.isScalar())
7031         continue;
7032       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7033         LLVM_DEBUG(
7034             dbgs()
7035             << "LV: Not considering vector loop of width " << VF
7036             << " because it will not generate any vector instructions.\n");
7037         continue;
7038       }
7039       if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
7040         LLVM_DEBUG(
7041             dbgs()
7042             << "LV: Not considering vector loop of width " << VF
7043             << " because it would cause replicated blocks to be generated,"
7044             << " which isn't allowed when optimizing for size.\n");
7045         continue;
7046       }
7047 
7048       InstructionCost Cost = cost(*P, VF);
7049       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7050 
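           // When maximizing vector bandwidth, skip VFs whose estimated register
           // usage exceeds the number of registers available on the target.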
7051       if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) {
7052         LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
7053                           << VF << " because it uses too many registers\n");
7054         continue;
7055       }
7056 
7057       if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7058         BestFactor = CurrentFactor;
7059 
7060       // If profitable, add it to the ProfitableVFs list.
7061       if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
7062         ProfitableVFs.push_back(CurrentFactor);
7063     }
7064   }
7065 
7066 #ifndef NDEBUG
7067   // Select the optimal vectorization factor according to the legacy cost-model.
7068   // This is now only used to verify the decisions by the new VPlan-based
7069   // cost-model and will be retired once the VPlan-based cost-model is
7070   // stabilized.
7071   VectorizationFactor LegacyVF = selectVectorizationFactor();
7072   VPlan &BestPlan = getPlanFor(BestFactor.Width);
7073 
7074   // Pre-compute the cost and use it to check if BestPlan contains any
7075   // simplifications not accounted for in the legacy cost model. If that's the
7076   // case, don't trigger the assertion, as the extra simplifications may cause a
7077   // different VF to be picked by the VPlan-based cost model.
7078   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7079                         CM.CostKind);
7080   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7081   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
7082   // with early exits and plans with additional VPlan simplifications. The
7083   // legacy cost model doesn't properly model costs for such loops.
7084   assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7085           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7086                                                 CostCtx, OrigLoop,
7087                                                 BestFactor.Width) ||
7088           planContainsAdditionalSimplifications(
7089               getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7090          "VPlan cost model and legacy cost model disagreed");
7091   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7092          "when vectorizing, the scalar cost must be computed.");
7093 #endif
7094 
7095   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7096   return BestFactor;
7097 }
7098 
7099 static void addRuntimeUnrollDisableMetaData(Loop *L) {
7100   SmallVector<Metadata *, 4> MDs;
7101   // Reserve first location for self reference to the LoopID metadata node.
7102   MDs.push_back(nullptr);
7103   bool IsUnrollMetadata = false;
7104   MDNode *LoopID = L->getLoopID();
7105   if (LoopID) {
7106     // First find existing loop unrolling disable metadata.
7107     for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7108       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7109       if (MD) {
7110         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7111         IsUnrollMetadata |=
7112             S && S->getString().starts_with("llvm.loop.unroll.disable");
7113       }
7114       MDs.push_back(LoopID->getOperand(I));
7115     }
7116   }
7117 
7118   if (!IsUnrollMetadata) {
7119     // Add runtime unroll disable metadata.
7120     LLVMContext &Context = L->getHeader()->getContext();
7121     SmallVector<Metadata *, 1> DisableOperands;
7122     DisableOperands.push_back(
7123         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7124     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7125     MDs.push_back(DisableNode);
7126     MDNode *NewLoopID = MDNode::get(Context, MDs);
7127     // Set operand 0 to refer to the loop id itself.
7128     NewLoopID->replaceOperandWith(0, NewLoopID);
7129     L->setLoopID(NewLoopID);
7130   }
7131 }
7132 
7133 static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
7134   using namespace VPlanPatternMatch;
7135   assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
7136          "RdxResult must be ComputeFindIVResult");
7137   VPValue *StartVPV = RdxResult->getOperand(1);
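       // Look through a freeze of the start value, if there is one.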
7138   match(StartVPV, m_Freeze(m_VPValue(StartVPV)));
7139   return StartVPV->getLiveInIRValue();
7140 }
7141 
7142 // If \p EpiResumePhiR is the resume VPPhi for a reduction when vectorizing
7143 // the epilogue loop, fix the reduction's scalar PHI node by adding the
7144 // incoming value from the main vector loop.
7145 static void fixReductionScalarResumeWhenVectorizingEpilog(
7146     VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) {
7147   // Get the VPInstruction computing the reduction result in the middle block.
7148   // The first operand may not be from the middle block if it is not connected
7149   // to the scalar preheader. In that case, there's nothing to fix.
7150   VPValue *Incoming = EpiResumePhiR->getOperand(0);
7151   match(Incoming, VPlanPatternMatch::m_ZExtOrSExt(
7152                       VPlanPatternMatch::m_VPValue(Incoming)));
7153   auto *EpiRedResult = dyn_cast<VPInstruction>(Incoming);
7154   if (!EpiRedResult ||
7155       (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
7156        EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
7157        EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
7158     return;
7159 
7160   auto *EpiRedHeaderPhi =
7161       cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7162   RecurKind Kind = EpiRedHeaderPhi->getRecurrenceKind();
7163   Value *MainResumeValue;
7164   if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
7165     assert((VPI->getOpcode() == VPInstruction::Broadcast ||
7166             VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
7167            "unexpected start recipe");
7168     MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
7169   } else
7170     MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
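       // For AnyOf reductions, the resume value from the main loop is a compare
       // of the original resume phi against the start value; unwrap it to
       // recover the phi itself.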
7171   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) {
7172     [[maybe_unused]] Value *StartV =
7173         EpiRedResult->getOperand(1)->getLiveInIRValue();
7174     auto *Cmp = cast<ICmpInst>(MainResumeValue);
7175     assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7176            "AnyOf expected to start with ICMP_NE");
7177     assert(Cmp->getOperand(1) == StartV &&
7178            "AnyOf expected to start by comparing main resume value to original "
7179            "start value");
7180     MainResumeValue = Cmp->getOperand(0);
7181   } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
7182     Value *StartV = getStartValueFromReductionResult(EpiRedResult);
7183     Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
7184     using namespace llvm::PatternMatch;
7185     Value *Cmp, *OrigResumeV, *CmpOp;
7186     [[maybe_unused]] bool IsExpectedPattern =
7187         match(MainResumeValue,
7188               m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
7189                        m_Value(OrigResumeV))) &&
7190         (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7191                                    m_Value(CmpOp))) &&
7192          ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
7193     assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7194     MainResumeValue = OrigResumeV;
7195   }
7196   PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7197 
7198   // When fixing reductions in the epilogue loop we should already have
7199   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7200   // over the incoming values correctly.
7201   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiR, true));
7202   EpiResumePhi->setIncomingValueForBlock(
7203       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7204 }
7205 
7206 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7207     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7208     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7209   assert(BestVPlan.hasVF(BestVF) &&
7210          "Trying to execute plan with unsupported VF");
7211   assert(BestVPlan.hasUF(BestUF) &&
7212          "Trying to execute plan with unsupported UF");
7213   if (BestVPlan.hasEarlyExit())
7214     ++LoopsEarlyExitVectorized;
7215   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7216   // cost model is complete for better cost estimates.
7217   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
7218                            OrigLoop->getHeader()->getContext());
7219   VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
7220   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
7221   bool HasBranchWeights =
7222       hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
7223   if (HasBranchWeights) {
7224     std::optional<unsigned> VScale = CM.getVScaleForTuning();
7225     VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
7226                              BestVPlan, BestVF, VScale);
7227   }
7228 
7229   if (!VectorizingEpilogue) {
7230     // Checks are the same for all VPlans, added to BestVPlan only for
7231     // compactness.
7232     attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
7233   }
7234 
7235   // Retrieve VectorPH now, while it's still easy because the VPlan has Regions.
7236   VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
7237   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7238   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
7239   VPlanTransforms::narrowInterleaveGroups(
7240       BestVPlan, BestVF,
7241       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
7242   VPlanTransforms::removeDeadRecipes(BestVPlan);
7243 
7244   VPlanTransforms::convertToConcreteRecipes(BestVPlan,
7245                                             *Legal->getWidestInductionType());
7246   // Regions are dissolved after optimizing for VF and UF, which completely
7247   // removes unneeded loop regions first.
7248   VPlanTransforms::dissolveLoopRegions(BestVPlan);
7249   // Perform the actual loop transformation.
7250   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7251                          OrigLoop->getParentLoop(),
7252                          Legal->getWidestInductionType());
7253 
7254 #ifdef EXPENSIVE_CHECKS
7255   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7256 #endif
7257 
7258   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7259   // making any changes to the CFG.
7260   DenseMap<const SCEV *, Value *> ExpandedSCEVs;
7261   auto *Entry = cast<VPIRBasicBlock>(BestVPlan.getEntry());
7262   State.Builder.SetInsertPoint(Entry->getIRBasicBlock()->getTerminator());
7263   for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
7264     auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
7265     if (!ExpSCEV)
7266       continue;
7267     ExpSCEV->execute(State);
7268     ExpandedSCEVs[ExpSCEV->getSCEV()] = State.get(ExpSCEV, VPLane(0));
7269     VPValue *Exp = BestVPlan.getOrAddLiveIn(ExpandedSCEVs[ExpSCEV->getSCEV()]);
7270     ExpSCEV->replaceAllUsesWith(Exp);
7271     if (BestVPlan.getTripCount() == ExpSCEV)
7272       BestVPlan.resetTripCount(Exp);
7273     ExpSCEV->eraseFromParent();
7274   }
7275 
7276   if (!ILV.getTripCount())
7277     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7278   else
7279     assert(VectorizingEpilogue && "should only re-use the existing trip "
7280                                   "count during epilogue vectorization");
7281 
7282   // 1. Set up the skeleton for vectorization, including vector pre-header and
7283   // middle block. The vector loop is created during VPlan execution.
7284   BasicBlock *EntryBB =
7285       cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
7286   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7287   if (VectorizingEpilogue)
7288     VPlanTransforms::removeDeadRecipes(BestVPlan);
7289 
7290   assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
7291          "final VPlan is invalid");
7292 
7293   ILV.printDebugTracesAtStart();
7294 
7295   //===------------------------------------------------===//
7296   //
7297   // Notice: any optimization or new instruction that goes
7298   // into the code below should also be implemented in
7299   // the cost-model.
7300   //
7301   //===------------------------------------------------===//
7302 
7303   // 2. Copy and widen instructions from the old loop into the new loop.
7304   BestVPlan.prepareToExecute(
7305       ILV.getTripCount(),
7306       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
7307   replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7308 
7309   // Move check blocks to their final position.
7310   // TODO: Move as part of VPIRBB execute and update impacted tests.
7311   if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
7312     MemCheckBlock->moveAfter(EntryBB);
7313   if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
7314     SCEVCheckBlock->moveAfter(EntryBB);
7315 
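       // Generate code for the vectorized loop by executing the VPlan.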
7316   BestVPlan.execute(&State);
7317 
7318   // 2.5 When vectorizing the epilogue, fix reduction resume values from the
7319   // additional bypass block.
7320   if (VectorizingEpilogue) {
7321     assert(!BestVPlan.hasEarlyExit() &&
7322            "Epilogue vectorization not yet supported with early exits");
7323     BasicBlock *PH = OrigLoop->getLoopPreheader();
7324     BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
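         // Ensure each phi in the original loop's preheader has an incoming
         // value for every predecessor, adding the value arriving via the
         // additional bypass block where one is missing.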
7325     for (auto *Pred : predecessors(PH)) {
7326       for (PHINode &Phi : PH->phis()) {
7327         if (Phi.getBasicBlockIndex(Pred) != -1)
7328           continue;
7329         Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
7330       }
7331     }
7332     VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader();
7333     if (ScalarPH->getNumPredecessors() > 0) {
7334       // If ScalarPH has predecessors, we may need to update its reduction
7335       // resume values.
7336       for (VPRecipeBase &R : ScalarPH->phis()) {
7337         fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), State,
7338                                                       BypassBlock);
7339       }
7340     }
7341   }
7342 
7343   // 2.6. Maintain Loop Hints
7344   // Keep all loop hints from the original loop on the vector loop (we'll
7345   // replace the vectorizer-specific hints below).
7346   VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
7347   if (HeaderVPBB) {
7348     MDNode *OrigLoopID = OrigLoop->getLoopID();
7349 
7350     std::optional<MDNode *> VectorizedLoopID =
7351         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7352                                         LLVMLoopVectorizeFollowupVectorized});
7353 
7354     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7355     if (VectorizedLoopID) {
7356       L->setLoopID(*VectorizedLoopID);
7357     } else {
7358       // Keep all loop hints from the original loop on the vector loop (we'll
7359       // replace the vectorizer-specific hints below).
7360       if (MDNode *LID = OrigLoop->getLoopID())
7361         L->setLoopID(LID);
7362 
7363       LoopVectorizeHints Hints(L, true, *ORE);
7364       Hints.setAlreadyVectorized();
7365 
7366       // Check if it's EVL-vectorized and mark the corresponding metadata.
7367       bool IsEVLVectorized =
7368           llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) {
7369             // Looking for the ExplicitVectorLength VPInstruction.
7370             if (const auto *VI = dyn_cast<VPInstruction>(&Recipe))
7371               return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
7372             return false;
7373           });
7374       if (IsEVLVectorized) {
7375         LLVMContext &Context = L->getHeader()->getContext();
7376         MDNode *LoopID = L->getLoopID();
7377         auto *IsEVLVectorizedMD = MDNode::get(
7378             Context,
7379             {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"),
7380              MDString::get(Context, "evl")});
7381         MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {},
7382                                                            {IsEVLVectorizedMD});
7383         L->setLoopID(NewLoopID);
7384       }
7385     }
7386     TargetTransformInfo::UnrollingPreferences UP;
7387     TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7388     if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7389       addRuntimeUnrollDisableMetaData(L);
7390   }
7391 
7392   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7393   //    predication, updating analyses.
7394   ILV.fixVectorizedLoop(State);
7395 
7396   ILV.printDebugTracesAtEnd();
7397 
7398   return ExpandedSCEVs;
7399 }
7400 
7401 //===--------------------------------------------------------------------===//
7402 // EpilogueVectorizerMainLoop
7403 //===--------------------------------------------------------------------===//
7404 
7405 /// This function is partially responsible for generating the control flow
7406 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7407 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7408   createVectorLoopSkeleton("");
7409 
7410   // Generate the code to check the minimum iteration count of the vector
7411   // epilogue (see below).
7412   EPI.EpilogueIterationCountCheck =
7413       emitIterationCountCheck(LoopScalarPreHeader, true);
7414   EPI.EpilogueIterationCountCheck->setName("iter.check");
7415 
7416   // Generate the iteration count check for the main loop, *after* the check
7417   // for the epilogue loop, so that the path-length is shorter for the case
7418   // that goes directly through the vector epilogue. The longer-path length for
7419   // the main loop is compensated for by the gain from vectorizing the larger
7420   // trip count. Note: the branch will get updated later on when we vectorize
7421   // the epilogue.
7422   EPI.MainLoopIterationCountCheck =
7423       emitIterationCountCheck(LoopScalarPreHeader, false);
7424 
7425   // Generate the induction variable.
7426   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7427 
7428   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
7429   return LoopVectorPreHeader;
7430 }
7431 
7432 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7433   LLVM_DEBUG({
7434     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7435            << "Main Loop VF:" << EPI.MainLoopVF
7436            << ", Main Loop UF:" << EPI.MainLoopUF
7437            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7438            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7439   });
7440 }
7441 
7442 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7443   DEBUG_WITH_TYPE(VerboseDebug, {
7444     dbgs() << "intermediate fn:\n"
7445            << *OrigLoop->getHeader()->getParent() << "\n";
7446   });
7447 }
7448 
7449 BasicBlock *
7450 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7451                                                     bool ForEpilogue) {
7452   assert(Bypass && "Expected valid bypass basic block.");
7453   Value *Count = getTripCount();
7454   MinProfitableTripCount = ElementCount::getFixed(0);
7455   Value *CheckMinIters = createIterationCountCheck(
7456       ForEpilogue ? EPI.EpilogueVF : VF, ForEpilogue ? EPI.EpilogueUF : UF);
7457 
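       // The current preheader becomes the trip-count check block; a new
       // preheader for the vector loop is split off from it below.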
7458   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7459   if (!ForEpilogue)
7460     TCCheckBlock->setName("vector.main.loop.iter.check");
7461 
7462   // Create new preheader for vector loop.
7463   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7464                                    static_cast<DominatorTree *>(nullptr), LI,
7465                                    nullptr, "vector.ph");
7466 
7467   if (ForEpilogue) {
7468     // Save the trip count so we don't have to regenerate it in the
7469     // vec.epilog.iter.check. This is safe to do because the trip count
7470     // generated here dominates the vector epilog iter check.
7471     EPI.TripCount = Count;
7472   }
7473 
7474   BranchInst &BI =
7475       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7476   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7477     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7478   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7479 
7480   // When vectorizing the main loop, its trip-count check is placed in a new
7481   // block, whereas the overall trip-count check is placed in the VPlan entry
7482   // block. When vectorizing the epilogue loop, its trip-count check is placed
7483   // in the VPlan entry block.
7484   if (!ForEpilogue)
7485     introduceCheckBlockInVPlan(TCCheckBlock);
7486   return TCCheckBlock;
7487 }
7488 
7489 //===--------------------------------------------------------------------===//
7490 // EpilogueVectorizerEpilogueLoop
7491 //===--------------------------------------------------------------------===//
7492 
7493 /// This function is partially responsible for generating the control flow
7494 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7495 BasicBlock *
7496 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7497   createVectorLoopSkeleton("vec.epilog.");
7498 
7499   // Now, compare the remaining count and, if there aren't enough iterations to
7500   // execute the vectorized epilogue, skip to the scalar part.
7501   LoopVectorPreHeader->setName("vec.epilog.ph");
7502   BasicBlock *VecEpilogueIterationCountCheck =
7503       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
7504                  nullptr, "vec.epilog.iter.check", true);
7505   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7506                                           VecEpilogueIterationCountCheck);
7507   AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7508 
7509   // Adjust the control flow taking the state info from the main loop
7510   // vectorization into account.
7511   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7512          "expected this to be saved from the previous pass.");
7513   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7514       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7515 
7516   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7517       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7518 
7519   // Adjust the terminators of runtime check blocks and phis using them.
7520   BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
7521   BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
7522   if (SCEVCheckBlock)
7523     SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
7524         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7525   if (MemCheckBlock)
7526     MemCheckBlock->getTerminator()->replaceUsesOfWith(
7527         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7528 
7529   DT->changeImmediateDominator(LoopScalarPreHeader,
7530                                EPI.EpilogueIterationCountCheck);
7531 
7532   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7533   // reductions which merge control-flow from the latch block and the middle
7534   // block. Update the incoming values here and move the Phi into the preheader.
7535   SmallVector<PHINode *, 4> PhisInBlock(
7536       llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
7537 
7538   for (PHINode *Phi : PhisInBlock) {
7539     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7540     Phi->replaceIncomingBlockWith(
7541         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7542         VecEpilogueIterationCountCheck);
7543 
7544     // If the phi doesn't have an incoming value from the
7545     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7546     // value and also those from other check blocks. This is needed for
7547     // reduction phis only.
7548     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7549           return EPI.EpilogueIterationCountCheck == IncB;
7550         }))
7551       continue;
7552     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7553     if (SCEVCheckBlock)
7554       Phi->removeIncomingValue(SCEVCheckBlock);
7555     if (MemCheckBlock)
7556       Phi->removeIncomingValue(MemCheckBlock);
7557   }
7558 
7559   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
7560   return LoopVectorPreHeader;
7561 }
7562 
7563 BasicBlock *
7564 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7565     BasicBlock *Bypass, BasicBlock *Insert) {
7566 
7567   assert(EPI.TripCount &&
7568          "Expected trip count to have been saved in the first pass.");
7569   Value *TC = EPI.TripCount;
7570   IRBuilder<> Builder(Insert->getTerminator());
7571   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7572 
7573   // Generate code to check if the loop's trip count is less than VF * UF of the
7574   // vector epilogue loop.
7575   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7576                ? ICmpInst::ICMP_ULE
7577                : ICmpInst::ICMP_ULT;
7578 
7579   Value *CheckMinIters =
7580       Builder.CreateICmp(P, Count,
7581                          createStepForVF(Builder, Count->getType(),
7582                                          EPI.EpilogueVF, EPI.EpilogueUF),
7583                          "min.epilog.iters.check");
7584 
7585   BranchInst &BI =
7586       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7587   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7588     // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't
7589     // think the MainLoopStep is correct.
7590     unsigned MainLoopStep = UF * VF.getKnownMinValue();
7591     unsigned EpilogueLoopStep =
7592         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7593     // We assume the remaining `Count` is equally distributed in
7594     // [0, MainLoopStep)
7595     // So the probability for `Count < EpilogueLoopStep` should be
7596     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7597     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7598     const uint32_t Weights[] = {EstimatedSkipCount,
7599                                 MainLoopStep - EstimatedSkipCount};
7600     setBranchWeights(BI, Weights, /*IsExpected=*/false);
7601   }
7602   ReplaceInstWithInst(Insert->getTerminator(), &BI);
7603 
7604   // A new entry block has been created for the epilogue VPlan. Hook it in, as
7605   // otherwise we would try to modify the entry to the main vector loop.
7606   VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
7607   VPBasicBlock *OldEntry = Plan.getEntry();
7608   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
7609   Plan.setEntry(NewEntry);
7610   // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
7611 
7612   return Insert;
7613 }
7614 
7615 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7616   LLVM_DEBUG({
7617     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7618            << "Epilogue Loop VF:" << EPI.EpilogueVF
7619            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7620   });
7621 }
7622 
7623 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7624   DEBUG_WITH_TYPE(VerboseDebug, {
7625     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7626   });
7627 }
7628 
7629 VPWidenMemoryRecipe *
7630 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
7631                                   VFRange &Range) {
7632   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7633          "Must be called with either a load or store");
7634 
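       // Returns true if the memory access will be widened for VF, either on
       // its own or as part of an interleave group.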
7635   auto WillWiden = [&](ElementCount VF) -> bool {
7636     LoopVectorizationCostModel::InstWidening Decision =
7637         CM.getWideningDecision(I, VF);
7638     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7639            "CM decision should be taken at this point.");
7640     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7641       return true;
7642     if (CM.isScalarAfterVectorization(I, VF) ||
7643         CM.isProfitableToScalarize(I, VF))
7644       return false;
7645     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7646   };
7647 
7648   if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
7649     return nullptr;
7650 
7651   VPValue *Mask = nullptr;
7652   if (Legal->isMaskRequired(I))
7653     Mask = getBlockInMask(Builder.getInsertBlock());
7654 
7655   // Determine if the pointer operand of the access is either consecutive or
7656   // reverse consecutive.
7657   LoopVectorizationCostModel::InstWidening Decision =
7658       CM.getWideningDecision(I, Range.Start);
7659   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
7660   bool Consecutive =
7661       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
7662 
7663   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
7664   if (Consecutive) {
7665     auto *GEP = dyn_cast<GetElementPtrInst>(
7666         Ptr->getUnderlyingValue()->stripPointerCasts());
7667     VPSingleDefRecipe *VectorPtr;
7668     if (Reverse) {
7669       // When folding the tail, we may compute an address that we wouldn't
7670       // compute in the original scalar loop, and it may not be inbounds. Drop
7671       // Inbounds in that case.
7672       GEPNoWrapFlags Flags =
7673           (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
7674               ? GEPNoWrapFlags::none()
7675               : GEPNoWrapFlags::inBounds();
7676       VectorPtr =
7677           new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
7678                                        /*Stride*/ -1, Flags, I->getDebugLoc());
7679     } else {
7680       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7681                                             GEP ? GEP->getNoWrapFlags()
7682                                                 : GEPNoWrapFlags::none(),
7683                                             I->getDebugLoc());
7684     }
7685     Builder.insert(VectorPtr);
7686     Ptr = VectorPtr;
7687   }
7688   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7689     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7690                                  VPIRMetadata(*Load, LVer), I->getDebugLoc());
7691 
7692   StoreInst *Store = cast<StoreInst>(I);
7693   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
7694                                 Reverse, VPIRMetadata(*Store, LVer),
7695                                 I->getDebugLoc());
7696 }
7697 
7698 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7699 /// insert a recipe to expand the step for the induction recipe.
7700 static VPWidenIntOrFpInductionRecipe *
7701 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
7702                             VPValue *Start, const InductionDescriptor &IndDesc,
7703                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
7704   assert(IndDesc.getStartValue() ==
7705          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
7706   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7707          "step must be loop invariant");
7708 
7709   VPValue *Step =
7710       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
7711   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
7712     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7713                                              IndDesc, TruncI,
7714                                              TruncI->getDebugLoc());
7715   }
7716   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
7717   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7718                                            IndDesc, Phi->getDebugLoc());
7719 }
7720 
7721 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
7722     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
7723 
7724   // Check if this is an integer or fp induction. If so, build the recipe that
7725   // produces its scalar and vector values.
7726   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7727     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
7728                                        *PSE.getSE(), *OrigLoop);
7729 
7730   // Check if this is pointer induction. If so, build the recipe for it.
7731   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7732     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
7733                                                            *PSE.getSE());
7734     return new VPWidenPointerInductionRecipe(
7735         Phi, Operands[0], Step, &Plan.getVFxUF(), *II,
7736         LoopVectorizationPlanner::getDecisionAndClampRange(
7737             [&](ElementCount VF) {
7738               return CM.isScalarAfterVectorization(Phi, VF);
7739             },
7740             Range),
7741         Phi->getDebugLoc());
7742   }
7743   return nullptr;
7744 }
7745 
7746 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
7747     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
7748   // Optimize the special case where the source is a constant integer
7749   // induction variable. Notice that we can only optimize the 'trunc' case
7750   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7751   // (c) other casts depend on pointer size.
7752 
7753   // Determine whether \p K is a truncation based on an induction variable that
7754   // can be optimized.
7755   auto IsOptimizableIVTruncate =
7756       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7757     return [=](ElementCount VF) -> bool {
7758       return CM.isOptimizableIVTruncate(K, VF);
7759     };
7760   };
7761 
7762   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7763           IsOptimizableIVTruncate(I), Range)) {
7764 
7765     auto *Phi = cast<PHINode>(I->getOperand(0));
7766     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
7767     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
7768     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
7769                                        *OrigLoop);
7770   }
7771   return nullptr;
7772 }
7773 
7774 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
7775                                                    ArrayRef<VPValue *> Operands,
7776                                                    VFRange &Range) {
7777   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7778       [this, CI](ElementCount VF) {
7779         return CM.isScalarWithPredication(CI, VF);
7780       },
7781       Range);
7782 
7783   if (IsPredicated)
7784     return nullptr;
7785 
7786   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7787   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7788              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7789              ID == Intrinsic::pseudoprobe ||
7790              ID == Intrinsic::experimental_noalias_scope_decl))
7791     return nullptr;
7792 
7793   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
7794 
7795   // Is it beneficial to perform an intrinsic call rather than a lib call?
7796   bool ShouldUseVectorIntrinsic =
7797       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
7798                 [&](ElementCount VF) -> bool {
7799                   return CM.getCallWideningDecision(CI, VF).Kind ==
7800                          LoopVectorizationCostModel::CM_IntrinsicCall;
7801                 },
7802                 Range);
7803   if (ShouldUseVectorIntrinsic)
7804     return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
7805                                       CI->getDebugLoc());
7806 
7807   Function *Variant = nullptr;
7808   std::optional<unsigned> MaskPos;
7809   // Is it better to call a vectorized version of the function than to
7810   // scalarize the call?
7811   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
7812       [&](ElementCount VF) -> bool {
7813         // The following case may be scalarized depending on the VF.
7814         // The flag shows whether we can use a usual Call for the vectorized
7815         // version of the instruction.
7816 
7817         // If we've found a variant at a previous VF, then stop looking. A
7818         // vectorized variant of a function expects input in a certain shape
7819         // -- basically the number of input registers, the number of lanes
7820         // per register, and whether there's a mask required.
7821         // We store a pointer to the variant in the VPWidenCallRecipe, so
7822         // once we have an appropriate variant it's only valid for that VF.
7823         // This will force a different vplan to be generated for each VF that
7824         // finds a valid variant.
7825         if (Variant)
7826           return false;
7827         LoopVectorizationCostModel::CallWideningDecision Decision =
7828             CM.getCallWideningDecision(CI, VF);
7829         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
7830           Variant = Decision.Variant;
7831           MaskPos = Decision.MaskPos;
7832           return true;
7833         }
7834 
7835         return false;
7836       },
7837       Range);
7838   if (ShouldUseVectorCall) {
7839     if (MaskPos.has_value()) {
7840       // We have 2 cases that would require a mask:
7841       //   1) The block needs to be predicated, either due to a conditional
7842       //      in the scalar loop or use of an active lane mask with
7843       //      tail-folding, and we use the appropriate mask for the block.
7844       //   2) No mask is required for the block, but the only available
7845       //      vector variant at this VF requires a mask, so we synthesize an
7846       //      all-true mask.
7847       VPValue *Mask = nullptr;
7848       if (Legal->isMaskRequired(CI))
7849         Mask = getBlockInMask(Builder.getInsertBlock());
7850       else
7851         Mask = Plan.getOrAddLiveIn(
7852             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
7853 
7854       Ops.insert(Ops.begin() + *MaskPos, Mask);
7855     }
7856 
7857     Ops.push_back(Operands.back());
7858     return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
7859   }
7860 
7861   return nullptr;
7862 }
7863 
7864 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7865   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7866          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7867   // Instruction should be widened, unless it is scalar after vectorization,
7868   // scalarization is profitable, or it is predicated.
7869   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7870     return CM.isScalarAfterVectorization(I, VF) ||
7871            CM.isProfitableToScalarize(I, VF) ||
7872            CM.isScalarWithPredication(I, VF);
7873   };
7874   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7875                                                              Range);
7876 }
7877 
7878 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
7879                                            ArrayRef<VPValue *> Operands) {
7880   switch (I->getOpcode()) {
7881   default:
7882     return nullptr;
7883   case Instruction::SDiv:
7884   case Instruction::UDiv:
7885   case Instruction::SRem:
7886   case Instruction::URem: {
7887     // If not provably safe, use a select to form a safe divisor before widening the
7888     // div/rem operation itself.  Otherwise fall through to general handling below.
7889     if (CM.isPredicatedInst(I)) {
7890       SmallVector<VPValue *> Ops(Operands);
7891       VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
7892       VPValue *One =
7893           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
7894       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
7895       Ops[1] = SafeRHS;
7896       return new VPWidenRecipe(*I, Ops);
7897     }
7898     [[fallthrough]];
7899   }
7900   case Instruction::Add:
7901   case Instruction::And:
7902   case Instruction::AShr:
7903   case Instruction::FAdd:
7904   case Instruction::FCmp:
7905   case Instruction::FDiv:
7906   case Instruction::FMul:
7907   case Instruction::FNeg:
7908   case Instruction::FRem:
7909   case Instruction::FSub:
7910   case Instruction::ICmp:
7911   case Instruction::LShr:
7912   case Instruction::Mul:
7913   case Instruction::Or:
7914   case Instruction::Select:
7915   case Instruction::Shl:
7916   case Instruction::Sub:
7917   case Instruction::Xor:
7918   case Instruction::Freeze: {
7919     SmallVector<VPValue *> NewOps(Operands);
7920     if (Instruction::isBinaryOp(I->getOpcode())) {
7921       // The legacy cost model uses SCEV to check if some of the operands are
7922       // constants. To match the legacy cost model's behavior, use SCEV to try
7923       // to replace operands with constants.
7924       ScalarEvolution &SE = *PSE.getSE();
7925       auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
7926         if (!Op->isLiveIn())
7927           return Op;
7928         Value *V = Op->getUnderlyingValue();
7929         if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
7930           return Op;
7931         auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
7932         if (!C)
7933           return Op;
7934         return Plan.getOrAddLiveIn(C->getValue());
7935       };
7936       // For Mul, the legacy cost model checks both operands.
7937       if (I->getOpcode() == Instruction::Mul)
7938         NewOps[0] = GetConstantViaSCEV(NewOps[0]);
7939       // For other binops, the legacy cost model only checks the second operand.
7940       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
7941     }
7942     return new VPWidenRecipe(*I, NewOps);
7943   }
7944   case Instruction::ExtractValue: {
7945     SmallVector<VPValue *> NewOps(Operands);
7946     Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
7947     auto *EVI = cast<ExtractValueInst>(I);
7948     assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7949     unsigned Idx = EVI->getIndices()[0];
7950     NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
7951     return new VPWidenRecipe(*I, NewOps);
7952   }
7953   };
7954 }
7955 
7956 VPHistogramRecipe *
7957 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7958                                      ArrayRef<VPValue *> Operands) {
7959   // FIXME: Support other operations.
7960   unsigned Opcode = HI->Update->getOpcode();
7961   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7962          "Histogram update operation must be an Add or Sub");
7963 
7964   SmallVector<VPValue *, 3> HGramOps;
7965   // Bucket address.
7966   HGramOps.push_back(Operands[1]);
7967   // Increment value.
7968   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
7969 
7970   // In case of predicated execution (due to tail-folding, or conditional
7971   // execution, or both), pass the relevant mask.
7972   if (Legal->isMaskRequired(HI->Store))
7973     HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
7974 
7975   return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
7976 }
7977 
7978 VPReplicateRecipe *
7979 VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
7980                                    VFRange &Range) {
7981   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7982       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7983       Range);
7984 
7985   bool IsPredicated = CM.isPredicatedInst(I);
7986 
7987   // Even if the instruction is not marked as uniform, there are certain
7988   // intrinsic calls that can be effectively treated as such, so we check for
7989   // them here. Conservatively, we only do this for scalable vectors, since
7990   // for fixed-width VFs we can always fall back on full scalarization.
7991   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
7992     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
7993     case Intrinsic::assume:
7994     case Intrinsic::lifetime_start:
7995     case Intrinsic::lifetime_end:
7996       // For scalable vectors, if one of the operands is variant, we still want
7997       // to mark the call as uniform, which will generate one instruction for just
7998       // the first lane of the vector. We can't scalarize the call in the same
7999       // way as for fixed-width vectors because we don't know how many lanes
8000       // there are.
8001       //
8002       // The reasons for doing it this way for scalable vectors are:
8003       //   1. For the assume intrinsic, generating the instruction for the first
8004       //      lane is still better than not generating any at all. For
8005       //      example, the input may be a splat across all lanes.
8006       //   2. For the lifetime start/end intrinsics the pointer operand only
8007       //      does anything useful when the input comes from a stack object,
8008       //      which suggests it should always be uniform. For non-stack objects
8009       //      the effect is to poison the object, which still allows us to
8010       //      remove the call.
8011       IsUniform = true;
8012       break;
8013     default:
8014       break;
8015     }
8016   }
8017   VPValue *BlockInMask = nullptr;
8018   if (!IsPredicated) {
8019     // Finalize the recipe for Instr; here it is not predicated.
8020     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8021   } else {
8022     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8023     // Instructions marked for predication are replicated and a mask operand is
8024     // added initially. Masked replicate recipes will later be placed under an
8025     // if-then construct to prevent side-effects. Generate recipes to compute
8026     // the block mask for this region.
8027     BlockInMask = getBlockInMask(Builder.getInsertBlock());
8028   }
8029 
8030   // Note that there is some custom logic to mark some intrinsics as uniform
8031   // manually above for scalable vectors, which this assert needs to account for
8032   // as well.
8033   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8034           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8035          "Should not predicate a uniform recipe");
8036   auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask,
8037                                        VPIRMetadata(*I, LVer));
8038   return Recipe;
8039 }
8040 
8041 /// Find all possible partial reductions in the loop and track all of those that
8042 /// are valid so recipes can be formed later.
8043 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8044   // Find all possible partial reductions.
8045   SmallVector<std::pair<PartialReductionChain, unsigned>>
8046       PartialReductionChains;
8047   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
8048     getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
8049                         PartialReductionChains);
8050   }
8051 
8052   // A partial reduction is invalid if any of its extends are used by
8053   // something that isn't another partial reduction. This is because the
8054   // extends are intended to be lowered along with the reduction itself.
8055 
8056   // Build up a set of partial reduction ops for efficient use checking.
8057   SmallSet<User *, 4> PartialReductionOps;
8058   for (const auto &[PartialRdx, _] : PartialReductionChains)
8059     PartialReductionOps.insert(PartialRdx.ExtendUser);
8060 
8061   auto ExtendIsOnlyUsedByPartialReductions =
8062       [&PartialReductionOps](Instruction *Extend) {
8063         return all_of(Extend->users(), [&](const User *U) {
8064           return PartialReductionOps.contains(U);
8065         });
8066       };
8067 
8068   // Check if each use of a chain's two extends is a partial reduction
8069   // and only add those that don't have non-partial reduction users.
8070   for (auto Pair : PartialReductionChains) {
8071     PartialReductionChain Chain = Pair.first;
8072     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8073         (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
8074       ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
8075   }
8076 }
8077 
8078 bool VPRecipeBuilder::getScaledReductions(
8079     Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
8080     SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
8081   if (!CM.TheLoop->contains(RdxExitInstr))
8082     return false;
8083 
8084   auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8085   if (!Update)
8086     return false;
8087 
8088   Value *Op = Update->getOperand(0);
8089   Value *PhiOp = Update->getOperand(1);
8090   if (Op == PHI)
8091     std::swap(Op, PhiOp);
8092 
8093   // Try and get a scaled reduction from the first non-phi operand.
8094   // If one is found, we use the discovered reduction instruction in
8095   // place of the accumulator for costing.
8096   if (auto *OpInst = dyn_cast<Instruction>(Op)) {
8097     if (getScaledReductions(PHI, OpInst, Range, Chains)) {
8098       PHI = Chains.rbegin()->first.Reduction;
8099 
8100       Op = Update->getOperand(0);
8101       PhiOp = Update->getOperand(1);
8102       if (Op == PHI)
8103         std::swap(Op, PhiOp);
8104     }
8105   }
8106   if (PhiOp != PHI)
8107     return false;
8108 
8109   using namespace llvm::PatternMatch;
8110 
8111   // If the update is a binary operator, check both of its operands to see if
8112   // they are extends. Otherwise, see if the update comes directly from an
8113   // extend.
8114   Instruction *Exts[2] = {nullptr};
8115   BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Op);
8116   std::optional<unsigned> BinOpc;
8117   Type *ExtOpTypes[2] = {nullptr};
8118 
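       // Checks that all values in Ops are zext/sext instructions defined
       // inside the loop, recording each extend and its source type.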
8119   auto CollectExtInfo = [this, &Exts,
8120                          &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
8121     unsigned I = 0;
8122     for (Value *OpI : Ops) {
8123       Value *ExtOp;
8124       if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
8125         return false;
8126       Exts[I] = cast<Instruction>(OpI);
8127 
8128       // TODO: We should be able to support live-ins.
8129       if (!CM.TheLoop->contains(Exts[I]))
8130         return false;
8131 
8132       ExtOpTypes[I] = ExtOp->getType();
8133       I++;
8134     }
8135     return true;
8136   };
8137 
8138   if (ExtendUser) {
8139     if (!ExtendUser->hasOneUse())
8140       return false;
8141 
    // Use the side-effect of match to look through a negation, rebinding
    // ExtendUser to the negated bin-op when the pattern matches; we don't
    // care at this point whether it actually matched.
    match(ExtendUser, m_Neg(m_BinOp(ExtendUser)));

    SmallVector<Value *> Ops(ExtendUser->operands());
    if (!CollectExtInfo(Ops))
      return false;

    BinOpc = std::make_optional(ExtendUser->getOpcode());
  } else if (match(Update, m_Add(m_Value(), m_Value()))) {
    // We already know the operands for Update are Op and PhiOp.
    SmallVector<Value *> Ops({Op});
    if (!CollectExtInfo(Ops))
      return false;

    ExtendUser = Update;
    BinOpc = std::nullopt;
  } else
    return false;

  TTI::PartialReductionExtendKind OpAExtend =
      TTI::getPartialReductionExtendKind(Exts[0]);
  TTI::PartialReductionExtendKind OpBExtend =
      Exts[1] ? TTI::getPartialReductionExtendKind(Exts[1]) : TTI::PR_None;
  PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);

  TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
  TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
  if (!PHISize.hasKnownScalarFactor(ASize))
    return false;
  unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(ASize);
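  // E.g. (illustrative) an i32 accumulator phi fed by i8 extend operands
  // gives a scale factor of 32/8 = 4: four i8 partial products are
  // accumulated into each i32 lane.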

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          [&](ElementCount VF) {
            InstructionCost Cost = TTI->getPartialReductionCost(
                Update->getOpcode(), ExtOpTypes[0], ExtOpTypes[1],
                PHI->getType(), VF, OpAExtend, OpBExtend, BinOpc, CM.CostKind);
            return Cost.isValid();
          },
          Range)) {
    Chains.emplace_back(Chain, TargetScaleFactor);
    return true;
  }

  return false;
}

VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
                                                      VFRange &Range) {
  // First, check for specific widening recipes that deal with inductions, Phi
  // nodes, calls and memory operations.
  VPRecipeBase *Recipe;
  Instruction *Instr = R->getUnderlyingInstr();
  SmallVector<VPValue *, 4> Operands(R->operands());
  if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(R)) {
    VPBasicBlock *Parent = PhiR->getParent();
    [[maybe_unused]] VPRegionBlock *LoopRegionOf =
        Parent->getEnclosingLoopRegion();
    assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
           "Non-header phis should have been handled during predication");
    auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
    assert(Operands.size() == 2 && "Must have 2 operands for header phis");
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
      return Recipe;

    VPHeaderPHIRecipe *PhiRecipe = nullptr;
    assert((Legal->isReductionVariable(Phi) ||
            Legal->isFixedOrderRecurrence(Phi)) &&
           "can only widen reductions and fixed-order recurrences here");
    VPValue *StartV = Operands[0];
    if (Legal->isReductionVariable(Phi)) {
      const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi);
      assert(RdxDesc.getRecurrenceStartValue() ==
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));

      // If the PHI is used by a partial reduction, set the scale factor.
      unsigned ScaleFactor =
          getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
      PhiRecipe = new VPReductionPHIRecipe(
          Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi),
          CM.useOrderedReductions(RdxDesc), ScaleFactor);
    } else {
      // TODO: Currently fixed-order recurrences are modeled as chains of
      // first-order recurrences. If there are no users of the intermediate
      // recurrences in the chain, the fixed order recurrence should be modeled
      // directly, enabling more efficient codegen.
      PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
    }
    // Add backedge value.
    PhiRecipe->addOperand(Operands[1]);
    return PhiRecipe;
  }

  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                    cast<TruncInst>(Instr), Operands, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Instr))
    return tryToWidenCall(CI, Operands, Range);

  if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
    if (auto HistInfo = Legal->getHistogramInfo(SI))
      return tryToWidenHistogram(*HistInfo, Operands);

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return tryToWidenMemory(Instr, Operands, Range);

  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
    return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
    return new VPWidenGEPRecipe(GEP, Operands);

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    return new VPWidenSelectRecipe(*SI, Operands);
  }

  if (auto *CI = dyn_cast<CastInst>(Instr)) {
    return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
                                 *CI);
  }

  return tryToWiden(Instr, Operands);
}

VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
                                             ArrayRef<VPValue *> Operands,
                                             unsigned ScaleFactor) {
  assert(Operands.size() == 2 &&
         "Unexpected number of operands for partial reduction");

  VPValue *BinOp = Operands[0];
  VPValue *Accumulator = Operands[1];
  VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
  if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
      isa<VPPartialReductionRecipe>(BinOpRecipe))
    std::swap(BinOp, Accumulator);

  unsigned ReductionOpcode = Reduction->getOpcode();
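  // A subtraction is normalized to an addition by negating the bin-op input,
  // conceptually: acc -= x  ==>  acc += (0 - x). The widened sub created
  // below computes the negation.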
  if (ReductionOpcode == Instruction::Sub) {
    auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
    SmallVector<VPValue *, 2> Ops;
    Ops.push_back(Plan.getOrAddLiveIn(Zero));
    Ops.push_back(BinOp);
    BinOp = new VPWidenRecipe(*Reduction, Ops);
    Builder.insert(BinOp->getDefiningRecipe());
    ReductionOpcode = Instruction::Add;
  }

  VPValue *Cond = nullptr;
  if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
    assert((ReductionOpcode == Instruction::Add ||
            ReductionOpcode == Instruction::Sub) &&
           "Expected an ADD or SUB operation for predicated partial "
           "reductions (because the neutral element in the mask is zero)!");
    Cond = getBlockInMask(Builder.getInsertBlock());
    VPValue *Zero =
        Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
    BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
  }
  return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
                                      ScaleFactor, Reduction);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  if (ElementCount::isKnownGT(MinVF, MaxVF))
    return;

  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  const LoopAccessInfo *LAI = Legal->getLAI();
  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
                      OrigLoop, LI, DT, PSE.getSE());
  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
    // Only use noalias metadata when using memory checks guaranteeing no
    // overlap across all iterations.
    LVer.prepareNoAliasMetadata();
  }

  auto MaxVFTimes2 = MaxVF * 2;
  auto VPlan0 = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
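  // Each iteration builds one VPlan for a sub-range of VFs, clamped by
  // tryToBuildVPlanWithVPRecipes; e.g. (illustrative) MinVF=4, MaxVF=16 may
  // yield plans covering the sub-ranges [4,8), [8,16) and [16,32).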
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(
            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
      bool HasScalarVF = Plan->hasScalarVFOnly();
      // Now optimize the initial VPlan.
      if (!HasScalarVF)
        VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
                                 *Plan, CM.getMinimalBitwidths());
      VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
      // TODO: try to put it close to addActiveLaneMask().
      // Discard the plan if it is not EVL-compatible.
      if (CM.foldTailWithEVL() && !HasScalarVF &&
          !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
                                    *Plan, CM.getMaxSafeElements()))
        break;
      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(std::move(Plan));
    }
    VF = SubRange.End;
  }
}

/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
/// the end value of the induction.
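/// Conceptually, for a non-canonical induction with start S and step St, the
/// end value is S + VectorTC * St (computed by the derived-IV recipe below);
/// for the canonical induction it is the vector trip count itself.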
static VPInstruction *addResumePhiRecipeForInduction(
    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
    VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
  // Truncated wide inductions resume from the last lane of their vector value
  // in the last vector iteration, which is handled elsewhere.
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  VPValue *Start = WideIV->getStartValue();
  VPValue *Step = WideIV->getStepValue();
  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
  VPValue *EndValue = VectorTC;
  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    EndValue = VectorPHBuilder.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, VectorTC, Step);
  }

  // EndValue is derived from the vector trip count (which has the same type as
  // the widest induction) and thus may be wider than the induction here.
  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
    EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
                                                ScalarTypeOfWideIV,
                                                WideIV->getDebugLoc());
  }

  auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
      {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
  return ResumePhiRecipe;
}

/// Create resume phis in the scalar preheader for first-order recurrences,
/// reductions and inductions, and update the VPIRInstructions wrapping the
/// original phis in the scalar header. End values for inductions are added to
/// \p IVEndValues.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
                                DenseMap<VPValue *, VPValue *> &IVEndValues) {
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
  auto *ScalarPH = Plan.getScalarPreheader();
  auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBuilder VectorPHBuilder(
      cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
  VPBuilder ScalarPHBuilder(ScalarPH);
  for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
    auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);

    // TODO: Extract final value from induction recipe initially, optimize to
    // pre-computed end value together in optimizeInductionExitUsers.
    auto *VectorPhiR =
        cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
    if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
      if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
              WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
              &Plan.getVectorTripCount())) {
        assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
        IVEndValues[WideIVR] = ResumePhi->getOperand(0);
        ScalarPhiIRI->addOperand(ResumePhi);
        continue;
      }
      // TODO: Also handle truncated inductions here. Computing end-values
      // separately should be done as VPlan-to-VPlan optimization, after
      // legalizing all resume values to use the last lane from the loop.
      assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
             "should only skip truncated wide inductions");
      continue;
    }

    // The backedge value provides the value to resume coming out of a loop,
    // which for FORs is a vector whose last element needs to be extracted. The
    // start value provides the value if the loop is bypassed.
    bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
    auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");
    if (IsFOR)
      ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
          VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
          "vector.recur.extract");
    StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
    auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
        {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
    ScalarPhiIRI->addOperand(ResumePhiR);
  }
}

// Collect VPIRInstructions for phis in the exit block from the latch only.
static SetVector<VPIRInstruction *> collectUsersInLatchExitBlock(VPlan &Plan) {
  SetVector<VPIRInstruction *> ExitUsersToFix;
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {

    if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock())
      continue;

    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(&R);
      assert(ExitIRI->getNumOperands() == 1 && "must have a single operand");
      VPValue *V = ExitIRI->getOperand(0);
      if (V->isLiveIn())
        continue;
      assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
             "Only recipes defined inside a region should need fixing.");
      ExitUsersToFix.insert(ExitIRI);
    }
  }
  return ExitUsersToFix;
}

// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated.
static void
addUsersInExitBlocks(VPlan &Plan,
                     const SetVector<VPIRInstruction *> &ExitUsersToFix) {
  if (ExitUsersToFix.empty())
    return;

  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // Introduce extracts for exiting values and update the VPIRInstructions
  // modeling the corresponding LCSSA phis.
  for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
    assert(ExitIRI->getNumOperands() == 1 &&
           ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
           "exit values from early exits must be fixed when branch to "
           "early-exit is added");
    ExitIRI->extractLastLaneOfFirstOperand(B);
  }
}

/// Handle users in the original exit block for first-order recurrences. The
/// penultimate value of each recurrence is fed to its LCSSA phi users in the
/// original exit block via the VPIRInstruction wrapping the LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(
    VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix, VFRange &Range) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *ScalarPHVPBB = Plan.getScalarPreheader();
  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder ScalarPHBuilder(ScalarPHVPBB);
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  auto IsScalableOne = [](ElementCount VF) -> bool {
    return VF == ElementCount::getScalable(1);
  };

  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
    if (!FOR)
      continue;

    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");

    // This is the second phase of vectorizing first-order recurrences,
    // creating extracts for users outside the loop. An overview of the
    // transformation is described below. Suppose we have the following loop
    // with some use after the loop of the last a[i-1],
    //
    //   for (int i = 0; i < n; ++i) {
    //     t = a[i - 1];
    //     b[i] = a[i] - t;
    //   }
    //   use t;
    //
    // There is a first-order recurrence on "a". For this loop, the shorthand
    // scalar IR looks like:
    //
    //   scalar.ph:
    //     s.init = a[-1]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     use = lcssa.phi [s1, scalar.body]
    //
    // In this example, s1 is a recurrence because its value depends on the
    // previous iteration. In the first phase of vectorization, we created a
    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
    // for users in the scalar preheader and exit block.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3]
    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
    //     b[i, i+1, i+2, i+3] = v2 - v1
    //     br cond, vector.body, middle.block
    //
    //   middle.block:
    //     vector.recur.extract.for.phi = v2(2)
    //     vector.recur.extract = v2(3)
    //     br cond, scalar.ph, exit.block
    //
    //   scalar.ph:
    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
    //                             [s.init, otherwise]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     lo = lcssa.phi [s1, scalar.body],
    //                    [vector.recur.extract.for.phi, middle.block]
    //
    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
    // Extract the penultimate value of the recurrence and use it as operand
    // for the VPIRInstruction modeling the phi.
    for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
      if (ExitIRI->getOperand(0) != FOR)
        continue;
      // For VF vscale x 1, if vscale = 1, we are unable to extract the
      // penultimate value of the recurrence. Instead, we rely on the function
      // addUsersInExitBlocks to extract the last element from the result of
      // VPInstruction::FirstOrderRecurrenceSplice by leaving the user of the
      // recurrence phi in ExitUsersToFix.
      // TODO: Consider vscale_range info and UF.
      if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
                                                             Range))
        return;
      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
          VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
          {}, "vector.recur.extract.for.phi");
      ExitIRI->setOperand(0, PenultimateElement);
      ExitUsersToFix.remove(ExitIRI);
    }
  }
}

VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {

  using namespace llvm::VPlanPatternMatch;
  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create initial VPlan skeleton, having a basic block for the pre-header
  // which contains SCEV expansions that need to happen before the CFG is
  // modified; a basic block for the vector pre-header, followed by a region for
  // the vector loop, followed by the middle basic block. The skeleton vector
  // loop region contains a header and latch basic blocks.

  bool RequiresScalarEpilogueCheck =
      LoopVectorizationPlanner::getDecisionAndClampRange(
          [this](ElementCount VF) {
            return !CM.requiresScalarEpilogue(VF.isVector());
          },
          Range);
  VPlanTransforms::prepareForVectorization(
      *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
      CM.foldTailByMasking(), OrigLoop,
      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()),
      Legal->hasUncountableEarlyExit(), Range);
  VPlanTransforms::createLoopRegions(*Plan);

  // Don't use getDecisionAndClampRange here, because we don't know the UF, so
  // it is better for this function to be conservative rather than split the
  // range into different VPlans.
  // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
  bool IVUpdateMayOverflow = false;
  for (ElementCount VF : Range)
    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);

  TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
  // Use NUW for the induction increment if we proved that it won't overflow in
  // the vector loop or when not folding the tail. In the latter case, we know
  // that the canonical induction increment will not overflow as the vector trip
  // count is >= increment and a multiple of the increment.
  bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
  if (!HasNUW) {
    auto *IVInc = Plan->getVectorLoopRegion()
                      ->getExitingBasicBlock()
                      ->getTerminator()
                      ->getOperand(0);
    assert(match(IVInc, m_VPInstruction<Instruction::Add>(
                            m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
           "Did not find the canonical IV increment");
    cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
  }

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto ApplyIG = [IG, this](ElementCount VF) -> bool {
      bool Result = (VF.isVector() && // Query is illegal for VF == 1
                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
                         LoopVectorizationCostModel::CM_Interleave);
      // For scalable vectors, the interleave factors must be <= 8 since we
      // require the (de)interleaveN intrinsics instead of shufflevectors.
      assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
             "Unsupported interleave factor for scalable vectors");
      return Result;
    };
    if (!getDecisionAndClampRange(ApplyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
  }

  // ---------------------------------------------------------------------------
  // Predicate and linearize the top-level loop region.
  // ---------------------------------------------------------------------------
  auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
      *Plan, CM.foldTailByMasking());

  // ---------------------------------------------------------------------------
  // Construct wide recipes and apply predication for original scalar
  // VPInstructions in the loop.
  // ---------------------------------------------------------------------------
  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                Builder, BlockMaskCache, LVer);
  RecipeBuilder.collectScaledReductions(Range);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      HeaderVPBB);

  auto *MiddleVPBB = Plan->getMiddleBlock();
  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
  // Mapping from VPValues in the initial plan to their widened VPValues. Needed
  // temporarily to update created block masks.
  DenseMap<VPValue *, VPValue *> Old2New;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Convert input VPInstructions to widened recipes.
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *SingleDef = cast<VPSingleDefRecipe>(&R);
      auto *UnderlyingValue = SingleDef->getUnderlyingValue();
      // Skip recipes that do not need transforming, including canonical IV,
      // wide canonical IV and VPInstructions without underlying values. The
      // latter are added above for masking.
      // FIXME: Migrate code relying on the underlying instruction from VPlan0
      // to construct recipes below to not use the underlying instruction.
      if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
              &R) ||
          (isa<VPInstruction>(&R) && !UnderlyingValue))
        continue;

      // FIXME: VPlan0, which models a copy of the original scalar loop, should
      // not use VPWidenPHIRecipe to model the phis.
      assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
             UnderlyingValue && "unsupported recipe");

      // TODO: Gradually replace uses of underlying instruction by analyses on
      // VPlan.
      Instruction *Instr = cast<Instruction>(UnderlyingValue);
      Builder.setInsertPoint(SingleDef);

      // The stores with invariant address inside the loop will be deleted, and
      // in the exit block, a uniform store recipe will be created for the final
      // invariant store of the reduction.
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(Instr)) &&
          Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
        // Only create recipe for the final invariant store of the reduction.
        if (Legal->isInvariantStoreOfReduction(SI)) {
          auto *Recipe =
              new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
                                    nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
          Recipe->insertBefore(*MiddleVPBB, MBIP);
        }
        R.eraseFromParent();
        continue;
      }

      VPRecipeBase *Recipe =
          RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
      if (!Recipe) {
        SmallVector<VPValue *, 4> Operands(R.operands());
        Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
      }

      RecipeBuilder.setRecipe(Instr, Recipe);
      if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
        // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
        // moved to the phi section in the header.
        Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
      } else {
        Builder.insert(Recipe);
      }
      if (Recipe->getNumDefinedValues() == 1) {
        SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
        Old2New[SingleDef] = Recipe->getVPSingleValue();
      } else {
        assert(Recipe->getNumDefinedValues() == 0 &&
               "Unexpected multidef recipe");
        R.eraseFromParent();
      }
    }
  }

  // replaceAllUsesWith above may invalidate the block masks. Update them here.
  // TODO: Include the masks as operands in the predicated VPlan directly
  // to remove the need to keep a map of masks beyond the predication
  // transform.
  RecipeBuilder.updateBlockMaskCache(Old2New);
  for (const auto &[Old, _] : Old2New)
    Old->getDefiningRecipe()->eraseFromParent();

  assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
         !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
         "entry block must be set to a VPRegionBlock having a non-empty entry "
         "VPBasicBlock");

  // Update wide induction increments to use the same step as the corresponding
  // wide induction. This enables detecting induction increments directly in
  // VPlan and removes redundant splats.
  for (const auto &[Phi, ID] : Legal->getInductionVars()) {
    auto *IVInc = cast<Instruction>(
        Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
    if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
      continue;
    VPWidenInductionRecipe *WideIV =
        cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
    VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
    R->setOperand(1, WideIV->getStepValue());
  }

  DenseMap<VPValue *, VPValue *> IVEndValues;
  addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
  SetVector<VPIRInstruction *> ExitUsersToFix =
      collectUsersInLatchExitBlock(*Plan);
  addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix, Range);
  addUsersInExitBlocks(*Plan, ExitUsersToFix);

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Adjust the recipes for any in-loop reductions.
  adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);

  // Apply mandatory transformation to handle FP maxnum/minnum reductions with
  // NaNs if possible, bail out otherwise.
  if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
                                *Plan))
    return nullptr;

  // Transform recipes to abstract recipes if it is legal and beneficial and
  // clamp the range for better cost estimation.
  // TODO: Enable the following transform when the EVL-versions of
  // extended-reduction and mulacc-reduction are implemented.
  if (!CM.foldTailWithEVL()) {
    VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                          CM.CostKind);
    VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                             CostCtx, Range);
  }

  for (ElementCount VF : Range)
    Plan->addVF(VF);
  Plan->setName("Initial VPlan");

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan,
                           InterleaveGroups, RecipeBuilder,
                           CM.isScalarEpilogueAllowed());

  // Replace VPValues for known constant strides guaranteed by predicated
  // scalar evolution.
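  // E.g. (illustrative) if versioning added the predicate %stride == 1, uses
  // of the %stride live-in (and of its sext/zext, handled below) are replaced
  // by the constant 1.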
  auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
    auto *R = cast<VPRecipeBase>(&U);
    return R->getParent()->getParent() ||
           R->getParent() ==
               Plan->getVectorLoopRegion()->getSinglePredecessor();
  };
  for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
    auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
    // Only handle constant strides for now.
    if (!ScevStride)
      continue;

    auto *CI = Plan->getOrAddLiveIn(
        ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
    if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);

    // The versioned value may not be used in the loop directly but through a
    // sext/zext. Add new live-ins in those cases.
    for (Value *U : StrideV->users()) {
      if (!isa<SExtInst, ZExtInst>(U))
        continue;
      VPValue *StrideVPV = Plan->getLiveIn(U);
      if (!StrideVPV)
        continue;
      unsigned BW = U->getType()->getScalarSizeInBits();
      APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
                                 : ScevStride->getAPInt().zext(BW);
      VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
    }
  }

  auto BlockNeedsPredication = [this](BasicBlock *BB) {
    return Legal->blockNeedsPredication(BB);
  };
  VPlanTransforms::runPass(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan,
                           BlockNeedsPredication);

  // Sink users of fixed-order recurrences past the recipes defining their
  // previous values and introduce FirstOrderRecurrenceSplice VPInstructions.
  if (!VPlanTransforms::runPass(VPlanTransforms::adjustFixedOrderRecurrences,
                                *Plan, Builder))
    return nullptr;

  if (useActiveLaneMask(Style)) {
    // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
    // TailFoldingStyle is visible there.
    bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
    bool WithoutRuntimeCheck =
        Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
    VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
                                       WithoutRuntimeCheck);
  }
  VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}

VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
  VPlanTransforms::prepareForVectorization(
      *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop,
      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false,
      Range);
  VPlanTransforms::createLoopRegions(*Plan);

  for (ElementCount VF : Range)
    Plan->addVF(VF);

  if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
          Plan,
          [this](PHINode *P) {
            return Legal->getIntOrFpInductionDescriptor(P);
          },
          *PSE.getSE(), *TLI))
    return nullptr;

  // Collect mapping of IR header phis to header phi recipes, to be used in
  // addScalarResumePhis.
  DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                Builder, BlockMaskCache, nullptr /*LVer*/);
  for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    if (isa<VPCanonicalIVPHIRecipe>(&R))
      continue;
    auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
    RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
  }
  DenseMap<VPValue *, VPValue *> IVEndValues;
  // TODO: IVEndValues are not used yet in the native path, to optimize exit
  // values.
  addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions, the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and users outside the vector region when folding the tail.
//
// A ComputeReductionResult recipe is added to the middle block, also for
// in-loop reductions which compute their result in-loop, because generating
// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
//
// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in any
// iteration. The final value is selected by the final ComputeReductionResult.
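//
// For example (illustrative), for an in-loop integer add reduction
//   %phi = phi [ 0, %ph ], [ %sum, %loop ]
//   %sum = add %phi, %x
// the add becomes a VPReductionRecipe taking the scalar chain (%phi) and the
// vector operand (%x), plus the block mask when predication is required.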
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
  using namespace VPlanPatternMatch;
  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
  VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
  SmallVector<VPRecipeBase *> ToDelete;

  for (VPRecipeBase &R : Header->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
      continue;

    RecurKind Kind = PhiR->getRecurrenceKind();
    assert(
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
        !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
        "AnyOf and FindIV reductions are not allowed for in-loop reductions");

    // Collect the chain of "link" recipes for the reduction starting at PhiR.
    SetVector<VPSingleDefRecipe *> Worklist;
    Worklist.insert(PhiR);
    for (unsigned I = 0; I != Worklist.size(); ++I) {
      VPSingleDefRecipe *Cur = Worklist[I];
      for (VPUser *U : Cur->users()) {
        auto *UserRecipe = cast<VPSingleDefRecipe>(U);
        if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
          assert((UserRecipe->getParent() == MiddleVPBB ||
                  UserRecipe->getParent() == Plan->getScalarPreheader()) &&
                 "U must be either in the loop region, the middle block or the "
                 "scalar preheader.");
          continue;
        }
        Worklist.insert(UserRecipe);
      }
    }

    // Visit operation "Links" along the reduction chain top-down starting from
    // the phi until LoopExitValue. We keep track of the previous item
    // (PreviousLink) to tell which of the two operands of a Link will remain
    // scalar and which will be reduced. For minmax by select(cmp), Link will be
    // the select instructions. Blend recipes of in-loop reduction phis will
    // get folded to their non-phi operand, as the reduction recipe handles the
    // condition directly.
    VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
    for (VPSingleDefRecipe *CurrentLink : drop_begin(Worklist)) {
      if (auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink)) {
        assert(Blend->getNumIncomingValues() == 2 &&
               "Blend must have 2 incoming values");
        if (Blend->getIncomingValue(0) == PhiR) {
          Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
        } else {
          assert(Blend->getIncomingValue(1) == PhiR &&
                 "PhiR must be an operand of the blend");
          Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
        }
        continue;
      }

      Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();

      // Index of the first operand which holds a non-mask vector operand.
      unsigned IndexOfFirstOperand;
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      VPValue *VecOp;
      VPBasicBlock *LinkVPBB = CurrentLink->getParent();
      if (IsFMulAdd) {
        assert(
            RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
            "Expected instruction to be a call to the llvm.fmuladd intrinsic");
        assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
                isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
               CurrentLink->getOperand(2) == PreviousLink &&
               "expected a call where the previous link is the added operand");

        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe (multiplying the first two operands of
        // the fmuladd together) to use as the vector operand for the fadd
        // reduction.
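        // Conceptually: fmuladd(a, b, acc) ==> fadd(acc, fmul(a, b)), with
        // the fadd part handled by the reduction recipe created further down.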
        VPInstruction *FMulRecipe = new VPInstruction(
            Instruction::FMul,
            {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
            CurrentLinkI->getFastMathFlags());
        LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
        VecOp = FMulRecipe;
      } else {
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
          if (isa<VPWidenRecipe>(CurrentLink)) {
            assert(isa<CmpInst>(CurrentLinkI) &&
                   "need to have the compare of the select");
            continue;
          }
          assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
                 "must be a select recipe");
          IndexOfFirstOperand = 1;
        } else {
          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
                 "Expected to replace a VPWidenSC");
          IndexOfFirstOperand = 0;
        }
        // Note that for non-commutable operands (cmp-selects), the semantics of
        // the cmp-select are captured in the recurrence kind.
        unsigned VecOpId =
            CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
                ? IndexOfFirstOperand + 1
                : IndexOfFirstOperand;
        VecOp = CurrentLink->getOperand(VecOpId);
        assert(VecOp != PreviousLink &&
               CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
                                       (VecOpId - IndexOfFirstOperand)) ==
                   PreviousLink &&
               "PreviousLink must be the operand other than VecOp");
      }

      VPValue *CondOp = nullptr;
      if (CM.blockNeedsPredicationForAnyReason(CurrentLinkI->getParent()))
        CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent());

      // TODO: Retrieve FMFs from recipes directly.
      RecurrenceDescriptor RdxDesc = Legal->getRecurrenceDescriptor(
          cast<PHINode>(PhiR->getUnderlyingInstr()));
      // Non-FP RdxDescs will have all fast math flags set, so clear them.
      FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
                               ? RdxDesc.getFastMathFlags()
                               : FastMathFlags();
      auto *RedRecipe = new VPReductionRecipe(
          Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
          PhiR->isOrdered(), CurrentLinkI->getDebugLoc());
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of its inputs, including CondOp.
      // Delete CurrentLink as it will be invalid if its operand is replaced
      // with a reduction defined at the bottom of the block in the next link.
      if (LinkVPBB->getNumSuccessors() == 0)
        RedRecipe->insertBefore(&*std::prev(std::prev(LinkVPBB->end())));
      else
        LinkVPBB->appendRecipe(RedRecipe);

      CurrentLink->replaceAllUsesWith(RedRecipe);
      ToDelete.push_back(CurrentLink);
      PreviousLink = RedRecipe;
    }
  }
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
  VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;

    const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
        cast<PHINode>(PhiR->getUnderlyingInstr()));
    Type *PhiTy = PhiR->getUnderlyingValue()->getType();
    // If tail is folded by masking, introduce selects between the phi
    // and the users outside the vector region of each reduction, at the
    // beginning of the dedicated latch block.
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    // Don't output selects for partial reductions because they have an output
    // with fewer lanes than the VF. So the operands of the select would have
    // different numbers of lanes. Partial reductions mask the input instead.
    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
        !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
      VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
      std::optional<FastMathFlags> FMFs =
          PhiTy->isFloatingPointTy()
              ? std::make_optional(RdxDesc.getFastMathFlags())
              : std::nullopt;
      NewExitingVPV =
          Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
      OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
        return isa<VPInstruction>(&U) &&
               (cast<VPInstruction>(&U)->getOpcode() ==
                    VPInstruction::ComputeAnyOfResult ||
                cast<VPInstruction>(&U)->getOpcode() ==
                    VPInstruction::ComputeReductionResult ||
                cast<VPInstruction>(&U)->getOpcode() ==
                    VPInstruction::ComputeFindIVResult);
      });
      if (CM.usePredicatedReductionSelect())
        PhiR->setOperand(1, NewExitingVPV);
    }

    // We want code in the middle block to appear to execute on the location of
    // the scalar loop's latch terminator because: (a) it is all compiler
    // generated, (b) these instructions are always executed after evaluating
    // the latch conditional branch, and (c) other passes may add new
    // predecessors which terminate on this line. This is the easiest way to
    // ensure we don't accidentally cause an extra step back into the loop while
    // debugging.
    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();

    // TODO: At the moment ComputeReductionResult also drives creation of the
    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
    // even for in-loop reductions, until the reduction resume value handling is
    // also modeled in VPlan.
    VPInstruction *FinalReductionResult;
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(MiddleVPBB, IP);
    RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
    if (RecurrenceDescriptor::isFindIVRecurrenceKind(RecurrenceKind)) {
      VPValue *Start = PhiR->getStartValue();
      VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
      FinalReductionResult =
          Builder.createNaryOp(VPInstruction::ComputeFindIVResult,
                               {PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
    } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
      VPValue *Start = PhiR->getStartValue();
      FinalReductionResult =
          Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
                               {PhiR, Start, NewExitingVPV}, ExitDL);
    } else {
      VPIRFlags Flags =
          RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind)
              ? VPIRFlags(RdxDesc.getFastMathFlags())
              : VPIRFlags();
      FinalReductionResult =
          Builder.createNaryOp(VPInstruction::ComputeReductionResult,
                               {PhiR, NewExitingVPV}, Flags, ExitDL);
    }
    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
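    // E.g. (illustrative) an i32 reduction provably needing only 8 bits has
    // its exiting vector value truncated to <N x i8> and sign-/zero-extended
    // back to i32, so InstCombine can narrow the whole expression.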
    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
      assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
             "Unexpected truncated min-max recurrence!");
      Type *RdxTy = RdxDesc.getRecurrenceType();
      auto *Trunc =
          new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
      Instruction::CastOps ExtendOpc =
          RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
      auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy);
      Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
      Extnd->insertAfter(Trunc);
      if (PhiR->getOperand(1) == NewExitingVPV)
        PhiR->setOperand(1, Extnd->getVPSingleValue());

      // Update ComputeReductionResult with the truncated exiting value and
      // extend its result.
      FinalReductionResult->setOperand(1, Trunc);
      FinalReductionResult =
          Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
    }

    // Update all users outside the vector region. Also replace redundant
    // ExtractLastElement.
    for (auto *U : to_vector(OrigExitingVPV->users())) {
      auto *Parent = cast<VPRecipeBase>(U)->getParent();
      if (FinalReductionResult == U || Parent->getParent())
        continue;
      U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
      if (match(U, m_VPInstruction<VPInstruction::ExtractLastElement>(
                       m_VPValue())))
        cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
    }

    // Adjust AnyOf reductions; replace the reduction phi for the selected value
    // with a boolean reduction phi node to check if the condition is true in
    // any iteration. The final value is selected by the final
    // ComputeReductionResult.
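    // E.g. (illustrative) 'rdx = select (icmp eq %x, 3), %val, rdx' keeps
    // only an i1 reduction 'any = or any, (icmp eq %x, 3)' in the loop;
    // ComputeAnyOfResult then yields the selected value if any lane was true.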
9238     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
9239       auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9240         return isa<VPWidenSelectRecipe>(U) ||
9241                (isa<VPReplicateRecipe>(U) &&
9242                 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9243                     Instruction::Select);
9244       }));
9245       VPValue *Cmp = Select->getOperand(0);
9246       // If the compare is checking the reduction PHI node, adjust it to check
9247       // the start value.
9248       if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
9249         CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
9250       Builder.setInsertPoint(Select);
9251 
9252       // If the true value of the select is the reduction phi, the new value is
9253       // selected if the negated condition is true in any iteration.
9254       if (Select->getOperand(1) == PhiR)
9255         Cmp = Builder.createNot(Cmp);
9256       VPValue *Or = Builder.createOr(PhiR, Cmp);
9257       Select->getVPSingleValue()->replaceAllUsesWith(Or);
9258       // Delete Select now that it has invalid types.
9259       ToDelete.push_back(Select);
9260 
9261       // Convert the reduction phi to operate on bools.
9262       PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9263                               OrigLoop->getHeader()->getContext())));
9264       continue;
9265     }
9266 
9267     if (RecurrenceDescriptor::isFindIVRecurrenceKind(
9268             RdxDesc.getRecurrenceKind())) {
9269       // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
9270       // the sentinel value after generating the ResumePhi recipe, which uses
9271       // the original start value.
9272       PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9273     }
9274     RecurKind RK = RdxDesc.getRecurrenceKind();
9275     if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
9276          !RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
9277          !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
9278       VPBuilder PHBuilder(Plan->getVectorPreheader());
9279       VPValue *Iden = Plan->getOrAddLiveIn(
9280           getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
9281       // If the PHI is used by a partial reduction, set the scale factor.
9282       unsigned ScaleFactor =
9283           RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
9284               .value_or(1);
9285       Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
9286       auto *ScaleFactorVPV =
9287           Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
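      // ReductionStartVector builds the vector entering the loop: e.g. an
      // integer add reduction with start value %s at VF 4 (scale factor 1)
      // starts as <%s, 0, 0, 0>, i.e. the start value in lane 0 and the
      // identity element in the remaining lanes.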
9288       VPValue *StartV = PHBuilder.createNaryOp(
9289           VPInstruction::ReductionStartVector,
9290           {PhiR->getStartValue(), Iden, ScaleFactorVPV},
9291           PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
9292                                      : FastMathFlags());
9293       PhiR->setOperand(0, StartV);
9294     }
9295   }
9296   for (VPRecipeBase *R : ToDelete)
9297     R->eraseFromParent();
9298 
9299   VPlanTransforms::runPass(VPlanTransforms::clearReductionWrapFlags, *Plan);
9300 }
9301 
9302 void LoopVectorizationPlanner::attachRuntimeChecks(
9303     VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
9304   const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
9305   if (SCEVCheckBlock) {
9306     assert((!CM.OptForSize ||
9307             CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
9308            "Cannot SCEV check stride or overflow when optimizing for size");
9309     VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
9310                                       HasBranchWeights);
9311   }
9312   const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
9313   if (MemCheckBlock) {
9314     // VPlan-native path does not do any analysis for runtime checks
9315     // currently.
9316     assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
9317            "Runtime checks are not supported for outer loops yet");
9318 
9319     if (CM.OptForSize) {
9320       assert(
9321           CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
9322           "Cannot emit memory checks when optimizing for size, unless forced "
9323           "to vectorize.");
9324       ORE->emit([&]() {
9325         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
9326                                           OrigLoop->getStartLoc(),
9327                                           OrigLoop->getHeader())
9328                << "Code-size may be reduced by not forcing "
9329                   "vectorization, or by source-code modifications "
9330                   "eliminating the need for runtime checks "
9331                   "(e.g., adding 'restrict').";
9332       });
9333     }
9334     VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
9335                                       HasBranchWeights);
9336   }
9337 }
9338 
9339 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9340   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9341 
9342   // Fast-math-flags propagate from the original induction instruction.
9343   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9344   if (FPBinOp)
9345     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9346 
9347   Value *Step = State.get(getStepValue(), VPLane(0));
9348   Value *Index = State.get(getOperand(1), VPLane(0));
9349   Value *DerivedIV = emitTransformedIndex(
9350       State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9351       cast_if_present<BinaryOperator>(FPBinOp));
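  // emitTransformedIndex folds Start + Index * Step according to the
  // induction kind (integer add, pointer, or FP using the recorded binop).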
9352   DerivedIV->setName(Name);
9353   // If index is the vector trip count, the concrete value will only be set in
9354   // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9355   // TODO: Remove the special case for the vector trip count once it is computed
9356   // in VPlan and can be used during VPlan simplification.
9357   assert((DerivedIV != Index ||
9358           getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9359          "IV didn't need transforming?");
9360   State.set(this, DerivedIV, VPLane(0));
9361 }
9362 
9363 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
9364 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9365 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9366 // for predication.
9367 static ScalarEpilogueLowering getScalarEpilogueLowering(
9368     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9369     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9370     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9371   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9372   // don't look at hints or options, and don't request a scalar epilogue.
9373   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9374   // LoopAccessInfo (due to code dependency and not being able to reliably get
9375   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9376   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9377   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9378   // back to the old way and vectorize with versioning when forced. See D81345.)
9379   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9380                                                       PGSOQueryType::IRPass) &&
9381                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9382     return CM_ScalarEpilogueNotAllowedOptSize;
9383 
9384   // 2) If set, obey the directives
9385   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9386     switch (PreferPredicateOverEpilogue) {
9387     case PreferPredicateTy::ScalarEpilogue:
9388       return CM_ScalarEpilogueAllowed;
9389     case PreferPredicateTy::PredicateElseScalarEpilogue:
9390       return CM_ScalarEpilogueNotNeededUsePredicate;
9391     case PreferPredicateTy::PredicateOrDontVectorize:
9392       return CM_ScalarEpilogueNotAllowedUsePredicate;
9393     };
9394   }
9395 
9396   // 3) If set, obey the hints
9397   switch (Hints.getPredicate()) {
9398   case LoopVectorizeHints::FK_Enabled:
9399     return CM_ScalarEpilogueNotNeededUsePredicate;
9400   case LoopVectorizeHints::FK_Disabled:
9401     return CM_ScalarEpilogueAllowed;
9402   };
9403 
9404   // 4) If the TTI hook indicates this is profitable, request predication.
9405   TailFoldingInfo TFI(TLI, &LVL, IAI);
9406   if (TTI->preferPredicateOverEpilogue(&TFI))
9407     return CM_ScalarEpilogueNotNeededUsePredicate;
9408 
9409   return CM_ScalarEpilogueAllowed;
9410 }
9411 
9412 // Process the loop in the VPlan-native vectorization path. This path builds
9413 // VPlan upfront in the vectorization pipeline, which allows applying
9414 // VPlan-to-VPlan transformations from the very beginning without modifying the
9415 // input LLVM IR.
9416 static bool processLoopInVPlanNativePath(
9417     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9418     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9419     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9420     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9421     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9422     LoopVectorizationRequirements &Requirements) {
9423 
9424   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9425     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9426     return false;
9427   }
9428   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9429   Function *F = L->getHeader()->getParent();
9430   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9431 
9432   ScalarEpilogueLowering SEL =
9433       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9434 
9435   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9436                                 &Hints, IAI, PSI, BFI);
9437   // Use the planner for outer loop vectorization.
9438   // TODO: CM is not used at this point inside the planner. Turn CM into an
9439   // optional argument if we don't need it in the future.
9440   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9441                                ORE);
9442 
9443   // Get user vectorization factor.
9444   ElementCount UserVF = Hints.getWidth();
9445 
9446   CM.collectElementTypesForWidening();
9447 
9448   // Plan how to best vectorize, return the best VF and its cost.
9449   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9450 
9451   // If we are stress testing VPlan builds, do not attempt to generate vector
9452   // code. Masked vector code generation support will follow soon.
9453   // Also, do not attempt to vectorize if no vector code will be produced.
9454   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9455     return false;
9456 
9457   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9458 
9459   {
9460     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9461     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9462                            VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
9463     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9464                       << L->getHeader()->getParent()->getName() << "\"\n");
9465     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9466   }
9467 
9468   reportVectorization(ORE, L, VF, 1);
9469 
9470   // Mark the loop as already vectorized to avoid vectorizing again.
9471   Hints.setAlreadyVectorized();
9472   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9473   return true;
9474 }
9475 
9476 // Emit a remark if there are stores to floats that required a floating point
9477 // extension. If the vectorized loop was generated with floating point, there
9478 // will be a performance penalty from the conversion overhead and the change in
9479 // the vector width.
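// For example, with float arrays, `a[i] = b[i] + 1.0;` extends each element
// to double and truncates the sum back to float, halving the effective
// vector width compared to a pure-float loop.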
9480 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9481   SmallVector<Instruction *, 4> Worklist;
9482   for (BasicBlock *BB : L->getBlocks()) {
9483     for (Instruction &Inst : *BB) {
9484       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9485         if (S->getValueOperand()->getType()->isFloatTy())
9486           Worklist.push_back(S);
9487       }
9488     }
9489   }
9490 
9491   // Traverse the floating point stores upwards, searching for floating point
9492   // conversions.
9493   SmallPtrSet<const Instruction *, 4> Visited;
9494   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9495   while (!Worklist.empty()) {
9496     auto *I = Worklist.pop_back_val();
9497     if (!L->contains(I))
9498       continue;
9499     if (!Visited.insert(I).second)
9500       continue;
9501 
9502     // Emit a remark if the floating point store required a floating
9503     // point conversion.
9504     // TODO: More work could be done to identify the root cause such as a
9505     // constant or a function return type and point the user to it.
9506     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9507       ORE->emit([&]() {
9508         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9509                                           I->getDebugLoc(), L->getHeader())
9510                << "floating point conversion changes vector width. "
9511                << "Mixed floating point precision requires an up/down "
9512                << "cast that will negatively impact performance.";
9513       });
9514 
9515     for (Use &Op : I->operands())
9516       if (auto *OpI = dyn_cast<Instruction>(Op))
9517         Worklist.push_back(OpI);
9518   }
9519 }
9520 
9521 /// For loops with uncountable early exits, find the cost of doing work when
9522 /// exiting the loop early, such as calculating the final exit values of
9523 /// variables used outside the loop.
9524 /// TODO: This is currently overly pessimistic because the loop may not take
9525 /// the early exit, but better to keep this conservative for now. In future,
9526 /// it might be possible to relax this by using branch probabilities.
9527 static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
9528                                               VPlan &Plan, ElementCount VF) {
9529   InstructionCost Cost = 0;
9530   for (auto *ExitVPBB : Plan.getExitBlocks()) {
9531     for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
9532       // If the predecessor is not the middle.block, then it must be the
9533       // vector.early.exit block, which may contain work to calculate the exit
9534       // values of variables used outside the loop.
9535       if (PredVPBB != Plan.getMiddleBlock()) {
9536         LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
9537                           << PredVPBB->getName() << ":\n");
9538         Cost += PredVPBB->cost(VF, CostCtx);
9539       }
9540     }
9541   }
9542   return Cost;
9543 }
9544 
9545 /// This function determines whether or not it's still profitable to vectorize
9546 /// the loop given the extra work we have to do outside of the loop:
9547 ///  1. Perform the runtime checks before entering the loop to ensure it's safe
9548 ///     to vectorize.
9549 ///  2. In the case of loops with uncountable early exits, we may have to do
9550 ///     extra work when exiting the loop early, such as calculating the final
9551 ///     exit values of variables used outside the loop.
9552 static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9553                                         VectorizationFactor &VF, Loop *L,
9554                                         PredicatedScalarEvolution &PSE,
9555                                         VPCostContext &CostCtx, VPlan &Plan,
9556                                         ScalarEpilogueLowering SEL,
9557                                         std::optional<unsigned> VScale) {
9558   InstructionCost TotalCost = Checks.getCost();
9559   if (!TotalCost.isValid())
9560     return false;
9561 
9562   // Add on the cost of any work required in the vector early exit block, if
9563   // one exists.
9564   TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
9565 
9566   // When interleaving only, the scalar and vector costs will be equal, which
9567   // in turn would lead to a divide by 0. Fall back to a hard threshold.
9568   if (VF.Width.isScalar()) {
9569     // TODO: Should we rename VectorizeMemoryCheckThreshold?
9570     if (TotalCost > VectorizeMemoryCheckThreshold) {
9571       LLVM_DEBUG(
9572           dbgs()
9573           << "LV: Interleaving only is not profitable due to runtime checks\n");
9574       return false;
9575     }
9576     return true;
9577   }
9578 
9579   // The scalar cost should only be 0 when vectorizing with a user specified
9580   // VF/IC. In those cases, runtime checks should always be generated.
9581   uint64_t ScalarC = VF.ScalarCost.getValue();
9582   if (ScalarC == 0)
9583     return true;
9584 
9585   // First, compute the minimum iteration count required so that the vector
9586   // loop outperforms the scalar loop.
9587   //  The total cost of the scalar loop is
9588   //   ScalarC * TC
9589   //  where
9590   //  * TC is the actual trip count of the loop.
9591   //  * ScalarC is the cost of a single scalar iteration.
9592   //
9593   //  The total cost of the vector loop is
9594   //    RtC + VecC * (TC / VF) + EpiC
9595   //  where
9596   //  * RtC is the cost of the generated runtime checks plus the cost of
9597   //    performing any additional work in the vector.early.exit block for loops
9598   //    with uncountable early exits.
9599   //  * VecC is the cost of a single vector iteration.
9600   //  * TC is the actual trip count of the loop
9601   //  * VF is the vectorization factor
9602   //  * EpiC is the cost of the generated epilogue, including the cost
9603   //    of the remaining scalar operations.
9604   //
9605   // Vectorization is profitable once the total vector cost is less than the
9606   // total scalar cost:
9607   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
9608   //
9609   // Now we can compute the minimum required trip count TC as
9610   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9611   //
9612   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9613   // the computations are performed with integer arithmetic and the result
9614   // is rounded up (divideCeil), hence we get an upper estimate of the TC.
9615   unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
9616   uint64_t RtC = TotalCost.getValue();
9617   uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
9618   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
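  // Illustrative numbers (not from any real cost model): with ScalarC = 4,
  // VecC = 8, IntVF = 4 and RtC = 24, each vector iteration covers IntVF
  // scalar ones and saves ScalarC * IntVF - VecC = 8 per vector iteration,
  // so MinTC1 = divideCeil(24 * 4, 8) = 12 iterations amortize the checks.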
9619 
9620   // Second, compute a minimum iteration count so that the cost of the
9621   // runtime checks is only a fraction of the total scalar loop cost. This
9622   // adds a loop-dependent bound on the overhead incurred if the runtime
9623   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9624   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9625   // cost, compute
9626   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
9627   uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
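  // Here X = 10, i.e. failed runtime checks may cost at most roughly a tenth
  // of the scalar loop. With the illustrative numbers above (RtC = 24,
  // ScalarC = 4), MinTC2 = divideCeil(240, 4) = 60 and dominates MinTC1.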
9628 
9629   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9630   // epilogue is allowed, choose the next closest multiple of VF. This should
9631   // partly compensate for ignoring the epilogue cost.
9632   uint64_t MinTC = std::max(MinTC1, MinTC2);
9633   if (SEL == CM_ScalarEpilogueAllowed)
9634     MinTC = alignTo(MinTC, IntVF);
9635   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9636 
9637   LLVM_DEBUG(
9638       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9639              << VF.MinProfitableTripCount << "\n");
9640 
9641   // Skip vectorization if the expected trip count is less than the minimum
9642   // required trip count.
9643   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9644     if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
9645       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9646                            "trip count < minimum profitable VF ("
9647                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
9648                         << ")\n");
9649 
9650       return false;
9651     }
9652   }
9653   return true;
9654 }
9655 
9656 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9657     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9658                                !EnableLoopInterleaving),
9659       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9660                               !EnableLoopVectorization) {}
9661 
9662 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
9663 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that
9664 /// don't have a corresponding wide induction in \p EpiPlan.
9665 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
9666   // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
9667   // will need their resume-values computed in the main vector loop. Others
9668   // can be removed from the main VPlan.
9669   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
9670   for (VPRecipeBase &R :
9671        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9672     if (isa<VPCanonicalIVPHIRecipe>(&R))
9673       continue;
9674     EpiWidenedPhis.insert(
9675         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
9676   }
9677   for (VPRecipeBase &R :
9678        make_early_inc_range(MainPlan.getScalarHeader()->phis())) {
9679     auto *VPIRInst = cast<VPIRPhi>(&R);
9680     if (EpiWidenedPhis.contains(&VPIRInst->getIRPhi()))
9681       continue;
9682     // There is no corresponding wide induction in the epilogue plan that would
9683     // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9684     // together with the corresponding ResumePhi. The resume values for the
9685     // scalar loop will be created during execution of EpiPlan.
9686     VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
9687     VPIRInst->eraseFromParent();
9688     ResumePhi->eraseFromParent();
9689   }
9690   VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan);
9691 
9692   using namespace VPlanPatternMatch;
9693   // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9694   // introduce multiple uses of undef/poison. If the reduction start value may
9695 // be undef or poison, it needs to be frozen and the frozen start has to be
9696   // used when computing the reduction result. We also need to use the frozen
9697   // value in the resume phi generated by the main vector loop, as this is also
9698   // used to compute the reduction result after the epilogue vector loop.
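  // E.g. if the start value is `undef`, its use in the main loop's reduction
  // result and its use in the epilogue resume phi could otherwise observe
  // different concrete values; freezing pins a single value for both uses.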
9699   auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9700                                              bool UpdateResumePhis) {
9701     VPBuilder Builder(Plan.getEntry());
9702     for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9703       auto *VPI = dyn_cast<VPInstruction>(&R);
9704       if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
9705         continue;
9706       VPValue *OrigStart = VPI->getOperand(1);
9707       if (isGuaranteedNotToBeUndefOrPoison(OrigStart->getLiveInIRValue()))
9708         continue;
9709       VPInstruction *Freeze =
9710           Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
9711       VPI->setOperand(1, Freeze);
9712       if (UpdateResumePhis)
9713         OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
9714           return Freeze != &U && isa<VPPhi>(&U);
9715         });
9716     }
9717   };
9718   AddFreezeForFindLastIVReductions(MainPlan, true);
9719   AddFreezeForFindLastIVReductions(EpiPlan, false);
9720 
9721   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9722   VPValue *VectorTC = &MainPlan.getVectorTripCount();
9723   // If there is a suitable resume value for the canonical induction in the
9724 // scalar (which will become vector) epilogue loop, we are done. Otherwise
9725   // create it below.
9726   if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
9727         return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
9728                                                            m_SpecificInt(0)));
9729       }))
9730     return;
9731   VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9732   ScalarPHBuilder.createScalarPhi(
9733       {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
9734       "vec.epilog.resume.val");
9735 }
9736 
9737 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9738 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
9739 static void
9740 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
9741                                  const SCEV2ValueTy &ExpandedSCEVs,
9742                                  const EpilogueLoopVectorizationInfo &EPI) {
9743   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9744   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9745   Header->setName("vec.epilog.vector.body");
9746 
9747   DenseMap<Value *, Value *> ToFrozen;
9748   // Ensure that the start values for all header phi recipes are updated before
9749   // vectorizing the epilogue loop.
9750   for (VPRecipeBase &R : Header->phis()) {
9751     if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
9752       // When vectorizing the epilogue loop, the canonical induction start
9753       // value needs to be changed from zero to the value after the main
9754       // vector loop. Find the resume value created during execution of the main
9755       // VPlan.
9756       // FIXME: Improve modeling for canonical IV start values in the epilogue
9757       // loop.
9758       using namespace llvm::PatternMatch;
9759       Type *IdxTy = IV->getScalarType();
9760       PHINode *EPResumeVal = find_singleton<PHINode>(
9761           L->getLoopPreheader()->phis(),
9762           [&EPI, IdxTy](PHINode &P, bool) -> PHINode * {
9763             if (P.getType() == IdxTy &&
9764                 match(
9765                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
9766                     m_SpecificInt(0)) &&
9767                 all_of(P.incoming_values(), [&EPI](Value *Inc) {
9768                   return Inc == EPI.VectorTripCount ||
9769                          match(Inc, m_SpecificInt(0));
9770                 }))
9771               return &P;
9772             return nullptr;
9773           });
9774       assert(EPResumeVal && "must have a resume value for the canonical IV");
9775       VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
9776       assert(all_of(IV->users(),
9777                     [](const VPUser *U) {
9778                       return isa<VPScalarIVStepsRecipe>(U) ||
9779                              isa<VPDerivedIVRecipe>(U) ||
9780                              cast<VPRecipeBase>(U)->isScalarCast() ||
9781                              cast<VPInstruction>(U)->getOpcode() ==
9782                                  Instruction::Add;
9783                     }) &&
9784              "the canonical IV should only be used by its increment or "
9785              "ScalarIVSteps when resetting the start value");
9786       IV->setOperand(0, VPV);
9787       continue;
9788     }
9789 
9790     Value *ResumeV = nullptr;
9791     // TODO: Move setting of resume values to prepareToExecute.
9792     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
9793       auto *RdxResult =
9794           cast<VPInstruction>(*find_if(ReductionPhi->users(), [](VPUser *U) {
9795             auto *VPI = dyn_cast<VPInstruction>(U);
9796             return VPI &&
9797                    (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
9798                     VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
9799                     VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
9800           }));
9801       ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
9802                     ->getIncomingValueForBlock(L->getLoopPreheader());
9803       RecurKind RK = ReductionPhi->getRecurrenceKind();
9804       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
9805         Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
9806         // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9807         // start value; compare the final value from the main vector loop
9808         // to the start value.
9809         BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
9810         IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9811         ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
9812       } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(RK)) {
9813         Value *StartV = getStartValueFromReductionResult(RdxResult);
9814         ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
9815             EPI.MainLoopIterationCountCheck);
9816 
9817         // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
9818         // an adjustment to the resume value. The resume value is adjusted to
9819         // the sentinel value when the final value from the main vector loop
9820         // equals the start value. This ensures correctness when the start value
9821         // might not be less than the minimum value of a monotonically
9822         // increasing induction variable.
9823         BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
9824         IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9825         Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
9826         Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
9827         ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
9828       } else {
9829         VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9830         auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9831         if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
9832           assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
9833                  "unexpected start value");
9834           VPI->setOperand(0, StartVal);
9835           continue;
9836         }
9837       }
9838     } else {
9839       // Retrieve the induction resume values for wide inductions from
9840       // their original phi nodes in the scalar loop.
9841       PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
9842       // Hook up to the PHINode generated by a ResumePhi recipe of main
9843       // loop VPlan, which feeds the scalar loop.
9844       ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
9845     }
9846     assert(ResumeV && "Must have a resume value");
9847     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9848     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
9849   }
9850 
9851   // For some VPValues in the epilogue plan we must re-use the generated IR
9852   // values from the main plan. Replace them with live-in VPValues.
9853   // TODO: This is a workaround needed for epilogue vectorization and it
9854   // should be removed once induction resume value creation is done
9855   // directly in VPlan.
9856   for (auto &R : make_early_inc_range(*Plan.getEntry())) {
9857     // Re-use frozen values from the main plan for Freeze VPInstructions in the
9858     // epilogue plan. This ensures all users use the same frozen value.
9859     auto *VPI = dyn_cast<VPInstruction>(&R);
9860     if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9861       VPI->replaceAllUsesWith(Plan.getOrAddLiveIn(
9862           ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
9863       continue;
9864     }
9865 
9866     // Re-use the trip count and steps expanded for the main loop, as
9867     // skeleton creation needs it as a value that dominates both the scalar
9868     // and vector epilogue loops.
9869     auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
9870     if (!ExpandR)
9871       continue;
9872     VPValue *ExpandedVal =
9873         Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
9874     ExpandR->replaceAllUsesWith(ExpandedVal);
9875     if (Plan.getTripCount() == ExpandR)
9876       Plan.resetTripCount(ExpandedVal);
9877     ExpandR->eraseFromParent();
9878   }
9879 }
9880 
9881 // Generate bypass values from the additional bypass block. Note that when the
9882 // vectorized epilogue is skipped due to the iteration count check, the
9883 // resume value for the induction variable comes from the trip count of the
9884 // main vector loop, passed in as MainVectorTripCount.
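// For example, for a non-primary induction `j = j0 + 3 * i`, the bypass value
// is computed as j0 + 3 * MainVectorTripCount via emitTransformedIndex.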
9885 static Value *createInductionAdditionalBypassValues(
9886     PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9887     const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9888     Instruction *OldInduction) {
9889   Value *Step = getExpandedStep(II, ExpandedSCEVs);
9890   // For the primary induction the additional bypass end value is known.
9891   // Otherwise it is computed.
9892   Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9893   if (OrigPhi != OldInduction) {
9894     auto *BinOp = II.getInductionBinOp();
9895     // Fast-math-flags propagate from the original induction instruction.
9896     if (isa_and_nonnull<FPMathOperator>(BinOp))
9897       BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9898 
9899     // Compute the end value for the additional bypass.
9900     EndValueFromAdditionalBypass =
9901         emitTransformedIndex(BypassBuilder, MainVectorTripCount,
9902                              II.getStartValue(), Step, II.getKind(), BinOp);
9903     EndValueFromAdditionalBypass->setName("ind.end");
9904   }
9905   return EndValueFromAdditionalBypass;
9906 }
9907 
9908 bool LoopVectorizePass::processLoop(Loop *L) {
9909   assert((EnableVPlanNativePath || L->isInnermost()) &&
9910          "VPlan-native path is not enabled. Only process inner loops.");
9911 
9912   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9913                     << L->getHeader()->getParent()->getName() << "' from "
9914                     << L->getLocStr() << "\n");
9915 
9916   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9917 
9918   LLVM_DEBUG(
9919       dbgs() << "LV: Loop hints:"
9920              << " force="
9921              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9922                      ? "disabled"
9923                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9924                             ? "enabled"
9925                             : "?"))
9926              << " width=" << Hints.getWidth()
9927              << " interleave=" << Hints.getInterleave() << "\n");
9928 
9929   // Function containing loop
9930   Function *F = L->getHeader()->getParent();
9931 
9932   // Looking at the diagnostic output is the only way to determine if a loop
9933   // was vectorized (other than looking at the IR or machine code), so it
9934   // is important to generate an optimization remark for each loop. Most of
9935   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9936   // generated as OptimizationRemark and OptimizationRemarkMissed are
9937   // less verbose, reporting vectorized loops and unvectorized loops that may
9938   // benefit from vectorization, respectively.
9939 
9940   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9941     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9942     return false;
9943   }
9944 
9945   PredicatedScalarEvolution PSE(*SE, *L);
9946 
9947   // Check if it is legal to vectorize the loop.
9948   LoopVectorizationRequirements Requirements;
9949   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9950                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9951   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9952     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9953     Hints.emitRemarkWithHints();
9954     return false;
9955   }
9956 
9957   if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
9958     reportVectorizationFailure("Auto-vectorization of loops with uncountable "
9959                                "early exit is not enabled",
9960                                "UncountableEarlyExitLoopsDisabled", ORE, L);
9961     return false;
9962   }
9963 
9964   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9965   // here. They may require CFG and instruction level transformations before
9966   // even evaluating whether vectorization is profitable. Since we cannot modify
9967   // the incoming IR, we need to build VPlan upfront in the vectorization
9968   // pipeline.
9969   if (!L->isInnermost())
9970     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9971                                         ORE, BFI, PSI, Hints, Requirements);
9972 
9973   assert(L->isInnermost() && "Inner loop expected.");
9974 
9975   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9976   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9977 
9978   // If an override option has been passed in for interleaved accesses, use it.
9979   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9980     UseInterleaved = EnableInterleavedMemAccesses;
9981 
9982   // Analyze interleaved memory accesses.
9983   if (UseInterleaved)
9984     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9985 
9986   if (LVL.hasUncountableEarlyExit()) {
9987     BasicBlock *LoopLatch = L->getLoopLatch();
9988     if (IAI.requiresScalarEpilogue() ||
9989         any_of(LVL.getCountableExitingBlocks(),
9990                [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9991       reportVectorizationFailure("Auto-vectorization of early exit loops "
9992                                  "requiring a scalar epilogue is unsupported",
9993                                  "UncountableEarlyExitUnsupported", ORE, L);
9994       return false;
9995     }
9996   }
9997 
9998   // Check the function attributes and profiles to find out if this function
9999   // should be optimized for size.
10000   ScalarEpilogueLowering SEL =
10001       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10002 
10003   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10004   // count by optimizing for size, to minimize overheads.
10005   auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10006   if (ExpectedTC && ExpectedTC->isFixed() &&
10007       ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
10008     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10009                       << "This loop is worth vectorizing only if no scalar "
10010                       << "iteration overheads are incurred.");
10011     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10012       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10013     else {
10014       LLVM_DEBUG(dbgs() << "\n");
10015       // Predicated tail-folded loops are efficient even when the loop
10016       // iteration count is low. However, setting the epilogue policy to
10017       // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10018       // with runtime checks. It's more effective to let
10019       // `isOutsideLoopWorkProfitable` determine if vectorization is
10020       // beneficial for the loop.
10021       if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10022         SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10023     }
10024   }
10025 
10026   // Check the function attributes to see if implicit floats or vectors are
10027   // allowed.
10028   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10029     reportVectorizationFailure(
10030         "Can't vectorize when the NoImplicitFloat attribute is used",
10031         "loop not vectorized due to NoImplicitFloat attribute",
10032         "NoImplicitFloat", ORE, L);
10033     Hints.emitRemarkWithHints();
10034     return false;
10035   }
10036 
10037   // Check if the target supports potentially unsafe FP vectorization.
10038   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10039   // for the target we're vectorizing for, to make sure none of the
10040   // additional fp-math flags can help.
10041   if (Hints.isPotentiallyUnsafe() &&
10042       TTI->isFPVectorizationPotentiallyUnsafe()) {
10043     reportVectorizationFailure(
10044         "Potentially unsafe FP op prevents vectorization",
10045         "loop not vectorized due to unsafe FP support.",
10046         "UnsafeFP", ORE, L);
10047     Hints.emitRemarkWithHints();
10048     return false;
10049   }
10050 
10051   bool AllowOrderedReductions;
10052   // If the flag is set, use that instead and override the TTI behaviour.
10053   if (ForceOrderedReductions.getNumOccurrences() > 0)
10054     AllowOrderedReductions = ForceOrderedReductions;
10055   else
10056     AllowOrderedReductions = TTI->enableOrderedReductions();
10057   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10058     ORE->emit([&]() {
10059       auto *ExactFPMathInst = Requirements.getExactFPInst();
10060       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10061                                                  ExactFPMathInst->getDebugLoc(),
10062                                                  ExactFPMathInst->getParent())
10063              << "loop not vectorized: cannot prove it is safe to reorder "
10064                 "floating-point operations";
10065     });
10066     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10067                          "reorder floating-point operations\n");
10068     Hints.emitRemarkWithHints();
10069     return false;
10070   }
10071 
10072   // Use the cost model.
10073   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10074                                 F, &Hints, IAI, PSI, BFI);
10075   // Use the planner for vectorization.
10076   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10077                                ORE);
10078 
10079   // Get user vectorization factor and interleave count.
10080   ElementCount UserVF = Hints.getWidth();
10081   unsigned UserIC = Hints.getInterleave();
10082   if (LVL.hasUncountableEarlyExit() && UserIC != 1) {
10083     UserIC = 1;
10084     reportVectorizationInfo("Interleaving not supported for loops "
10085                             "with uncountable early exits",
10086                             "InterleaveEarlyExitDisabled", ORE, L);
10087   }
10088 
10089   // Plan how to best vectorize.
10090   LVP.plan(UserVF, UserIC);
10091   VectorizationFactor VF = LVP.computeBestVF();
10092   unsigned IC = 1;
10093 
10094   if (ORE->allowExtraAnalysis(LV_NAME))
10095     LVP.emitInvalidCostRemarks(ORE);
10096 
10097   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
10098   if (LVP.hasPlanWithVF(VF.Width)) {
10099     // Select the interleave count.
10100     IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
10101 
10102     unsigned SelectedIC = std::max(IC, UserIC);
10103     //  Optimistically generate runtime checks if they are needed. Drop them if
10104     //  they turn out to not be profitable.
10105     if (VF.Width.isVector() || SelectedIC > 1)
10106       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10107 
10108     // Check if it is profitable to vectorize with runtime checks.
10109     bool ForceVectorization =
10110         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10111     VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
10112                           CM, CM.CostKind);
10113     if (!ForceVectorization &&
10114         !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
10115                                      LVP.getPlanFor(VF.Width), SEL,
10116                                      CM.getVScaleForTuning())) {
10117       ORE->emit([&]() {
10118         return OptimizationRemarkAnalysisAliasing(
10119                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10120                    L->getHeader())
10121                << "loop not vectorized: cannot prove it is safe to reorder "
10122                   "memory operations";
10123       });
10124       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10125       Hints.emitRemarkWithHints();
10126       return false;
10127     }
10128   }
10129 
10130   // Identify the diagnostic messages that should be produced.
10131   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10132   bool VectorizeLoop = true, InterleaveLoop = true;
10133   if (VF.Width.isScalar()) {
10134     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10135     VecDiagMsg = {
10136         "VectorizationNotBeneficial",
10137         "the cost-model indicates that vectorization is not beneficial"};
10138     VectorizeLoop = false;
10139   }
10140 
10141   if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10142     // Tell the user interleaving was avoided up-front, despite being explicitly
10143     // requested.
10144     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10145                          "interleaving should be avoided up front\n");
10146     IntDiagMsg = {"InterleavingAvoided",
10147                   "Ignoring UserIC, because interleaving was avoided up front"};
10148     InterleaveLoop = false;
10149   } else if (IC == 1 && UserIC <= 1) {
10150     // Tell the user interleaving is not beneficial.
10151     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10152     IntDiagMsg = {
10153         "InterleavingNotBeneficial",
10154         "the cost-model indicates that interleaving is not beneficial"};
10155     InterleaveLoop = false;
10156     if (UserIC == 1) {
10157       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10158       IntDiagMsg.second +=
10159           " and is explicitly disabled or interleave count is set to 1";
10160     }
10161   } else if (IC > 1 && UserIC == 1) {
10162     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10163     LLVM_DEBUG(
10164         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10165     IntDiagMsg = {"InterleavingBeneficialButDisabled",
10166                   "the cost-model indicates that interleaving is beneficial "
10167                   "but is explicitly disabled or interleave count is set to 1"};
10168     InterleaveLoop = false;
10169   }
10170 
10171   // If there is a histogram in the loop, do not just interleave without
10172   // vectorizing. The order of operations will be incorrect without the
10173   // histogram intrinsics, which are only used for recipes with VF > 1.
10174   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10175     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10176                       << "to histogram operations.\n");
10177     IntDiagMsg = {
10178         "HistogramPreventsScalarInterleaving",
10179         "Unable to interleave without vectorization due to constraints on "
10180         "the order of histogram operations"};
10181     InterleaveLoop = false;
10182   }
10183 
10184   // Override IC if user provided an interleave count.
10185   IC = UserIC > 0 ? UserIC : IC;
10186 
10187   // Emit diagnostic messages, if any.
10188   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10189   if (!VectorizeLoop && !InterleaveLoop) {
10190     // Do not vectorize or interleave the loop.
10191     ORE->emit([&]() {
10192       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10193                                       L->getStartLoc(), L->getHeader())
10194              << VecDiagMsg.second;
10195     });
10196     ORE->emit([&]() {
10197       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10198                                       L->getStartLoc(), L->getHeader())
10199              << IntDiagMsg.second;
10200     });
10201     return false;
10202   }
10203 
10204   if (!VectorizeLoop && InterleaveLoop) {
10205     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10206     ORE->emit([&]() {
10207       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10208                                         L->getStartLoc(), L->getHeader())
10209              << VecDiagMsg.second;
10210     });
10211   } else if (VectorizeLoop && !InterleaveLoop) {
10212     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10213                       << ") in " << L->getLocStr() << '\n');
10214     ORE->emit([&]() {
10215       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10216                                         L->getStartLoc(), L->getHeader())
10217              << IntDiagMsg.second;
10218     });
10219   } else if (VectorizeLoop && InterleaveLoop) {
10220     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10221                       << ") in " << L->getLocStr() << '\n');
10222     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10223   }
10224 
10225   bool DisableRuntimeUnroll = false;
10226   MDNode *OrigLoopID = L->getLoopID();
10227   {
10228     using namespace ore;
10229     if (!VectorizeLoop) {
10230       assert(IC > 1 && "interleave count should not be 1 or 0");
10231       // If we decided that it is not legal to vectorize the loop, then
10232       // interleave it.
10233       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10234       InnerLoopVectorizer Unroller(
10235           L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10236           ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
10237 
10238       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10239 
10240       ORE->emit([&]() {
10241         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10242                                   L->getHeader())
10243                << "interleaved loop (interleaved count: "
10244                << NV("InterleaveCount", IC) << ")";
10245       });
10246     } else {
10247       // If we decided that it is *legal* to vectorize the loop, then do it.
10248 
10249       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10250       // Consider vectorizing the epilogue too if it's profitable.
10251       VectorizationFactor EpilogueVF =
10252           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10253       if (EpilogueVF.Width.isVector()) {
10254         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10255 
10256         // The first pass vectorizes the main loop and creates a scalar epilogue
10257         // to be vectorized by executing the plan (potentially with a different
10258         // factor) again shortly afterwards.
10259         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10260         BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
10261         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10262         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10263                                           BestEpiPlan);
10264         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10265                                            EPI, &CM, BFI, PSI, Checks,
10266                                            *BestMainPlan);
10267         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10268                                              *BestMainPlan, MainILV, DT, false);
10269         ++LoopsVectorized;
10270 
10271         // Second pass vectorizes the epilogue and adjusts the control flow
10272         // edges from the first pass.
10273         EPI.MainLoopVF = EPI.EpilogueVF;
10274         EPI.MainLoopUF = EPI.EpilogueUF;
10275         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10276                                                  ORE, EPI, &CM, BFI, PSI,
10277                                                  Checks, BestEpiPlan);
10278         EpilogILV.setTripCount(MainILV.getTripCount());
10279         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10280 
10281         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10282                         DT, true);
10283 
10284         // Fix induction resume values from the additional bypass block.
10285         BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
10286         IRBuilder<> BypassBuilder(BypassBlock,
10287                                   BypassBlock->getFirstInsertionPt());
10288         BasicBlock *PH = L->getLoopPreheader();
10289         for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
10290           auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
10291           Value *V = createInductionAdditionalBypassValues(
10292               IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount,
10293               LVL.getPrimaryInduction());
10294           // TODO: Directly add as extra operand to the VPResumePHI recipe.
10295           Inc->setIncomingValueForBlock(BypassBlock, V);
10296         }
10297         ++LoopsEpilogueVectorized;
10298 
10299         if (!Checks.hasChecks())
10300           DisableRuntimeUnroll = true;
10301       } else {
10302         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10303                                VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
10304                                Checks, BestPlan);
10305         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10306         ++LoopsVectorized;
10307 
10308         // Add metadata to disable runtime unrolling a scalar loop when there
10309         // are no runtime checks about strides and memory. A scalar loop that is
10310         // rarely used is not worth unrolling.
10311         if (!Checks.hasChecks())
10312           DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      reportVectorization(ORE, L, VF, IC);
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
         "DT not preserved correctly");

  std::optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID) {
    L->setLoopID(*RemainderLoopID);
  } else {
    if (DisableRuntimeUnroll)
      addRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
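  // (getRegisterClassForType(true) queries the target's vector register
  // class; a maximum interleave factor of at least 2 means interleaving can
  // still be worthwhile.)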
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
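  // (Simplified form guarantees each loop has a preheader, a single
  // backedge, and dedicated exit blocks.)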
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed) {
      LAIs->clear();

#ifndef NDEBUG
      if (VerifySCEV)
        SE->verify();
#endif
    }
  }

  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  LI = &AM.getResult<LoopAnalysis>(F);
  // If there are no loops in the function, return early before computing
  // other expensive analyses.
  if (LI->empty())
    return PreservedAnalyses::all();
  SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  TTI = &AM.getResult<TargetIRAnalysis>(F);
  DT = &AM.getResult<DominatorTreeAnalysis>(F);
  TLI = &AM.getResult<TargetLibraryAnalysis>(F);
  AC = &AM.getResult<AssumptionAnalysis>(F);
  DB = &AM.getResult<DemandedBitsAnalysis>(F);
  ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  LAIs = &AM.getResult<LoopAccessAnalysis>(F);

  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result = runImpl(F);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

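  // Vectorization tends to duplicate debug records; when assignment tracking
  // is enabled, clean up the now-redundant ones.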
  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<ScalarEvolutionAnalysis>();
  PA.preserve<LoopAccessAnalysis>();

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}
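
// With both options at their defaults, the textual pipeline form printed
// above looks like this (assuming the pass's registered name is
// "loop-vectorize"):
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>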
10459