1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
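//
// For illustration only (pseudocode, not produced by this pass): a scalar loop
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten with VF = 4 (remainder handling omitted) as
//
//   for (int i = 0; i + 3 < n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3]; // one wide load/add/store
//
// with any leftover iterations executed by a scalar epilogue loop.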
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanPatternMatch.h"
63 #include "VPlanTransforms.h"
64 #include "VPlanVerifier.h"
65 #include "llvm/ADT/APInt.h"
66 #include "llvm/ADT/ArrayRef.h"
67 #include "llvm/ADT/DenseMap.h"
68 #include "llvm/ADT/DenseMapInfo.h"
69 #include "llvm/ADT/Hashing.h"
70 #include "llvm/ADT/MapVector.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/ValueTracking.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfo.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/MDBuilder.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/PatternMatch.h"
122 #include "llvm/IR/ProfDataUtils.h"
123 #include "llvm/IR/Type.h"
124 #include "llvm/IR/Use.h"
125 #include "llvm/IR/User.h"
126 #include "llvm/IR/Value.h"
127 #include "llvm/IR/ValueHandle.h"
128 #include "llvm/IR/VectorBuilder.h"
129 #include "llvm/IR/Verifier.h"
130 #include "llvm/Support/Casting.h"
131 #include "llvm/Support/CommandLine.h"
132 #include "llvm/Support/Compiler.h"
133 #include "llvm/Support/Debug.h"
134 #include "llvm/Support/ErrorHandling.h"
135 #include "llvm/Support/InstructionCost.h"
136 #include "llvm/Support/MathExtras.h"
137 #include "llvm/Support/raw_ostream.h"
138 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
139 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
140 #include "llvm/Transforms/Utils/LoopSimplify.h"
141 #include "llvm/Transforms/Utils/LoopUtils.h"
142 #include "llvm/Transforms/Utils/LoopVersioning.h"
143 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
144 #include "llvm/Transforms/Utils/SizeOpts.h"
145 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
146 #include <algorithm>
147 #include <cassert>
148 #include <cmath>
149 #include <cstdint>
150 #include <functional>
151 #include <iterator>
152 #include <limits>
153 #include <map>
154 #include <memory>
155 #include <string>
156 #include <tuple>
157 #include <utility>
158 
159 using namespace llvm;
160 
161 #define LV_NAME "loop-vectorize"
162 #define DEBUG_TYPE LV_NAME
163 
164 #ifndef NDEBUG
165 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
166 #endif
167 
168 /// @{
169 /// Metadata attribute names
170 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
171 const char LLVMLoopVectorizeFollowupVectorized[] =
172     "llvm.loop.vectorize.followup_vectorized";
173 const char LLVMLoopVectorizeFollowupEpilogue[] =
174     "llvm.loop.vectorize.followup_epilogue";
175 /// @}
176 
177 STATISTIC(LoopsVectorized, "Number of loops vectorized");
178 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
179 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
180 
181 static cl::opt<bool> EnableEpilogueVectorization(
182     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
183     cl::desc("Enable vectorization of epilogue loops."));
184 
185 static cl::opt<unsigned> EpilogueVectorizationForceVF(
186     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
187     cl::desc("When epilogue vectorization is enabled, and a value greater than "
188              "1 is specified, forces the given VF for all applicable epilogue "
189              "loops."));
190 
191 static cl::opt<unsigned> EpilogueVectorizationMinVF(
192     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
193     cl::desc("Only loops with vectorization factor equal to or larger than "
194              "the specified value are considered for epilogue vectorization."));
195 
196 /// Loops with a known constant trip count below this number are vectorized only
197 /// if no scalar iteration overheads are incurred.
198 static cl::opt<unsigned> TinyTripCountVectorThreshold(
199     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
200     cl::desc("Loops with a constant trip count that is smaller than this "
201              "value are vectorized only if no scalar iteration overheads "
202              "are incurred."));
203 
204 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
205     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
206     cl::desc("The maximum allowed number of runtime memory checks"));
207 
208 static cl::opt<bool> UseLegacyCostModel(
209     "vectorize-use-legacy-cost-model", cl::init(true), cl::Hidden,
210     cl::desc("Use the legacy cost model instead of the VPlan-based cost model. "
211              "This option will be removed in the future."));
212 
213 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
214 // and that predication is preferred; the values below list the choices. I.e.,
215 // the vectorizer will try to fold the tail loop (epilogue) into the vector body
216 // and predicate the instructions accordingly. If tail-folding fails, there are
217 // different fallback strategies depending on these values:
218 namespace PreferPredicateTy {
219   enum Option {
220     ScalarEpilogue = 0,
221     PredicateElseScalarEpilogue,
222     PredicateOrDontVectorize
223   };
224 } // namespace PreferPredicateTy
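// For example (illustrative only): with VF = 4 and a trip count of 10, a
// tail-folded loop runs 3 masked vector iterations with lane masks
// <1,1,1,1>, <1,1,1,1> and <1,1,0,0>, whereas an unfolded loop runs 2 unmasked
// vector iterations followed by a 2-iteration scalar epilogue.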
225 
226 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
227     "prefer-predicate-over-epilogue",
228     cl::init(PreferPredicateTy::ScalarEpilogue),
229     cl::Hidden,
230     cl::desc("Tail-folding and predication preferences over creating a scalar "
231              "epilogue loop."),
232     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
233                          "scalar-epilogue",
234                          "Don't tail-predicate loops, create scalar epilogue"),
235               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
236                          "predicate-else-scalar-epilogue",
237                          "prefer tail-folding, create scalar epilogue if tail "
238                          "folding fails."),
239               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
240                          "predicate-dont-vectorize",
241                          "prefer tail-folding, don't attempt vectorization if "
242                          "tail-folding fails.")));
243 
244 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
245     "force-tail-folding-style", cl::desc("Force the tail folding style"),
246     cl::init(TailFoldingStyle::None),
247     cl::values(
248         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
249         clEnumValN(
250             TailFoldingStyle::Data, "data",
251             "Create lane mask for data only, using active.lane.mask intrinsic"),
252         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
253                    "data-without-lane-mask",
254                    "Create lane mask with compare/stepvector"),
255         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
256                    "Create lane mask using active.lane.mask intrinsic, and use "
257                    "it for both data and control flow"),
258         clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
259                    "data-and-control-without-rt-check",
260                    "Similar to data-and-control, but remove the runtime check"),
261         clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
262                    "Use predicated EVL instructions for tail folding. If EVL "
263                    "is unsupported, fallback to data-without-lane-mask.")));
264 
265 static cl::opt<bool> MaximizeBandwidth(
266     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
267     cl::desc("Maximize bandwidth when selecting vectorization factor which "
268              "will be determined by the smallest type in loop."));
269 
270 static cl::opt<bool> EnableInterleavedMemAccesses(
271     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
272     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
273 
274 /// An interleave-group may need masking if it resides in a block that needs
275 /// predication, or in order to mask away gaps.
276 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
277     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
278     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
279 
280 static cl::opt<unsigned> ForceTargetNumScalarRegs(
281     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
282     cl::desc("A flag that overrides the target's number of scalar registers."));
283 
284 static cl::opt<unsigned> ForceTargetNumVectorRegs(
285     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
286     cl::desc("A flag that overrides the target's number of vector registers."));
287 
288 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
289     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
290     cl::desc("A flag that overrides the target's max interleave factor for "
291              "scalar loops."));
292 
293 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
294     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
295     cl::desc("A flag that overrides the target's max interleave factor for "
296              "vectorized loops."));
297 
298 cl::opt<unsigned> ForceTargetInstructionCost(
299     "force-target-instruction-cost", cl::init(0), cl::Hidden,
300     cl::desc("A flag that overrides the target's expected cost for "
301              "an instruction to a single constant value. Mostly "
302              "useful for getting consistent testing."));
303 
304 static cl::opt<bool> ForceTargetSupportsScalableVectors(
305     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
306     cl::desc(
307         "Pretend that scalable vectors are supported, even if the target does "
308         "not support them. This flag should only be used for testing."));
309 
310 static cl::opt<unsigned> SmallLoopCost(
311     "small-loop-cost", cl::init(20), cl::Hidden,
312     cl::desc(
313         "The cost of a loop that is considered 'small' by the interleaver."));
314 
315 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
316     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
317     cl::desc("Enable the use of the block frequency analysis to access PGO "
318              "heuristics minimizing code growth in cold regions and being more "
319              "aggressive in hot regions."));
320 
321 // Runtime interleave loops for load/store throughput.
322 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
323     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
324     cl::desc(
325         "Enable runtime interleaving until load/store ports are saturated"));
326 
327 /// The number of stores in a loop that are allowed to need predication.
328 static cl::opt<unsigned> NumberOfStoresToPredicate(
329     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
330     cl::desc("Max number of stores to be predicated behind an if."));
331 
332 static cl::opt<bool> EnableIndVarRegisterHeur(
333     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
334     cl::desc("Count the induction variable only once when interleaving"));
335 
336 static cl::opt<bool> EnableCondStoresVectorization(
337     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
338     cl::desc("Enable if predication of stores during vectorization."));
339 
340 static cl::opt<unsigned> MaxNestedScalarReductionIC(
341     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
342     cl::desc("The maximum interleave count to use when interleaving a scalar "
343              "reduction in a nested loop."));
344 
345 static cl::opt<bool>
346     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
347                            cl::Hidden,
348                            cl::desc("Prefer in-loop vector reductions, "
349                                     "overriding the targets preference."));
350 
351 static cl::opt<bool> ForceOrderedReductions(
352     "force-ordered-reductions", cl::init(false), cl::Hidden,
353     cl::desc("Enable the vectorisation of loops with in-order (strict) "
354              "FP reductions"));
355 
356 static cl::opt<bool> PreferPredicatedReductionSelect(
357     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
358     cl::desc(
359         "Prefer predicating a reduction operation over an after loop select."));
360 
361 namespace llvm {
362 cl::opt<bool> EnableVPlanNativePath(
363     "enable-vplan-native-path", cl::Hidden,
364     cl::desc("Enable VPlan-native vectorization path with "
365              "support for outer loop vectorization."));
366 }
367 
368 // This flag enables the stress testing of the VPlan H-CFG construction in the
369 // VPlan-native vectorization path. It must be used in conjunction with
370 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
371 // verification of the H-CFGs built.
372 static cl::opt<bool> VPlanBuildStressTest(
373     "vplan-build-stress-test", cl::init(false), cl::Hidden,
374     cl::desc(
375         "Build VPlan for every supported loop nest in the function and bail "
376         "out right after the build (stress test the VPlan H-CFG construction "
377         "in the VPlan-native vectorization path)."));
378 
379 cl::opt<bool> llvm::EnableLoopInterleaving(
380     "interleave-loops", cl::init(true), cl::Hidden,
381     cl::desc("Enable loop interleaving in Loop vectorization passes"));
382 cl::opt<bool> llvm::EnableLoopVectorization(
383     "vectorize-loops", cl::init(true), cl::Hidden,
384     cl::desc("Run the Loop vectorization passes"));
385 
386 static cl::opt<bool> PrintVPlansInDotFormat(
387     "vplan-print-in-dot-format", cl::Hidden,
388     cl::desc("Use dot format instead of plain text when dumping VPlans"));
389 
390 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
391     "force-widen-divrem-via-safe-divisor", cl::Hidden,
392     cl::desc(
393         "Override cost based safe divisor widening for div/rem instructions"));
394 
395 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
396     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
397     cl::Hidden,
398     cl::desc("Try wider VFs if they enable the use of vector variants"));
399 
400 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
401 // variables not overflowing do not hold. See `emitSCEVChecks`.
402 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
403 // Likelihood of bypassing the vectorized loop because pointers overlap. See
404 // `emitMemRuntimeChecks`.
405 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
406 // Likelihood of bypassing the vectorized loop because there are zero trips left
407 // after prolog. See `emitIterationCountCheck`.
408 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
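// Note (informal): branch weights are relative, so a {1, 127} pair models the
// bypass edge as taken roughly once every 128 executions (just under 1%).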
409 
410 /// A helper function that returns true if the given type is irregular. The
411 /// type is irregular if its allocated size doesn't equal the store size of an
412 /// element of the corresponding vector type.
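/// Illustrative examples, assuming a typical 64-bit data layout: i1 (type size
/// 1 bit, alloc size 8 bits) and x86_fp80 (type size 80 bits, alloc size 128
/// bits) are irregular, while i32 (32 bits in both senses) is not.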
413 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
414   // Determine if an array of N elements of type Ty is "bitcast compatible"
415   // with a <N x Ty> vector.
416   // This is only true if there is no padding between the array elements.
417   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
418 }
419 
420 /// Returns "best known" trip count for the specified loop \p L as defined by
421 /// the following procedure:
422 ///   1) Returns exact trip count if it is known.
423 ///   2) Returns expected trip count according to profile data if any.
424 ///   3) Returns upper bound estimate if it is known.
425 ///   4) Returns std::nullopt if all of the above failed.
426 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
427                                                    Loop *L) {
428   // Check if exact trip count is known.
429   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
430     return ExpectedTC;
431 
432   // Check if there is an expected trip count available from profile data.
433   if (LoopVectorizeWithBlockFrequency)
434     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
435       return *EstimatedTC;
436 
437   // Check if upper bound estimate is known.
438   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
439     return ExpectedTC;
440 
441   return std::nullopt;
442 }
443 
444 namespace {
445 // Forward declare GeneratedRTChecks.
446 class GeneratedRTChecks;
447 
448 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
449 } // namespace
450 
451 namespace llvm {
452 
453 AnalysisKey ShouldRunExtraVectorPasses::Key;
454 
455 /// InnerLoopVectorizer vectorizes loops which contain only one basic
456 /// block to a specified vectorization factor (VF).
457 /// This class performs the widening of scalars into vectors, or multiple
458 /// scalars. This class also implements the following features:
459 /// * It inserts an epilogue loop for handling loops that don't have iteration
460 ///   counts that are known to be a multiple of the vectorization factor.
461 /// * It handles the code generation for reduction variables.
462 /// * Scalarization (implementation using scalars) of un-vectorizable
463 ///   instructions.
464 /// InnerLoopVectorizer does not perform any vectorization-legality
465 /// checks, and relies on the caller to check for the different legality
466 /// aspects. The InnerLoopVectorizer relies on the
467 /// LoopVectorizationLegality class to provide information about the induction
468 /// and reduction variables that were found for a given vectorization factor.
469 class InnerLoopVectorizer {
470 public:
471   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
472                       LoopInfo *LI, DominatorTree *DT,
473                       const TargetLibraryInfo *TLI,
474                       const TargetTransformInfo *TTI, AssumptionCache *AC,
475                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
476                       ElementCount MinProfitableTripCount,
477                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
478                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
479                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
480       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
481         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
482         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
483         PSI(PSI), RTChecks(RTChecks) {
484     // Query this against the original loop and save it here because the profile
485     // of the original loop header may change as the transformation happens.
486     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
487         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
488 
489     if (MinProfitableTripCount.isZero())
490       this->MinProfitableTripCount = VecWidth;
491     else
492       this->MinProfitableTripCount = MinProfitableTripCount;
493   }
494 
495   virtual ~InnerLoopVectorizer() = default;
496 
497   /// Create a new empty loop that will contain vectorized instructions later
498   /// on, while the old loop will be used as the scalar remainder. Control flow
499   /// is generated around the vectorized (and scalar epilogue) loops consisting
500   /// of various checks and bypasses. Return the pre-header block of the new
501   /// loop and the start value for the canonical induction, if it is != 0. The
502   /// latter is the case when vectorizing the epilogue loop. In the case of
503   /// epilogue vectorization, this function is overridden to handle the more
504   /// complex control flow around the loops.  \p ExpandedSCEVs is used to
505   /// look up SCEV expansions for expressions needed during skeleton creation.
506   virtual std::pair<BasicBlock *, Value *>
507   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
508 
509   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
510   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
511 
512   // Return true if any runtime check is added.
513   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
514 
515   /// A helper function to scalarize a single Instruction in the innermost loop.
516   /// Generates a sequence of scalar instances for each lane between \p MinLane
517   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
518   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
519   /// Instr's operands.
520   void scalarizeInstruction(const Instruction *Instr,
521                             VPReplicateRecipe *RepRecipe,
522                             const VPIteration &Instance,
523                             VPTransformState &State);
524 
525   /// Fix the non-induction PHIs in \p Plan.
526   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
527 
528   /// Create a new phi node for the induction variable \p OrigPhi to resume
529   /// iteration count in the scalar epilogue, from where the vectorized loop
530   /// left off. \p Step is the SCEV-expanded induction step to use. In cases
531   /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
532   /// and the resume values can come from an additional bypass block, the \p
533   /// AdditionalBypass pair provides information about the bypass block and the
534   /// end value on the edge from bypass to this loop.
535   PHINode *createInductionResumeValue(
536       PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
537       ArrayRef<BasicBlock *> BypassBlocks,
538       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
539 
540   /// Returns the original loop trip count.
541   Value *getTripCount() const { return TripCount; }
542 
543   /// Used to set the trip count after ILV's construction and after the
544   /// preheader block has been executed. Note that this always holds the trip
545   /// count of the original loop for both main loop and epilogue vectorization.
546   void setTripCount(Value *TC) { TripCount = TC; }
547 
548 protected:
549   friend class LoopVectorizationPlanner;
550 
551   /// A small list of PHINodes.
552   using PhiVector = SmallVector<PHINode *, 4>;
553 
554   /// A type for scalarized values in the new loop. Each value from the
555   /// original loop, when scalarized, is represented by UF x VF scalar values
556   /// in the new unrolled loop, where UF is the unroll factor and VF is the
557   /// vectorization factor.
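  /// For example, with UF = 2 and VF = 4 each scalarized value is represented
  /// by 2 x 4 = 8 scalar values, grouped per unrolled part.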
558   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
559 
560   /// Set up the values of the IVs correctly when exiting the vector loop.
561   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
562                     Value *VectorTripCount, Value *EndValue,
563                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
564                     VPlan &Plan, VPTransformState &State);
565 
566   /// Iteratively sink the scalarized operands of a predicated instruction into
567   /// the block that was created for it.
568   void sinkScalarOperands(Instruction *PredInst);
569 
570   /// Returns (and creates if needed) the trip count of the widened loop.
571   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
572 
573   /// Emit a bypass check to see if the vector trip count is zero, including if
574   /// it overflows.
575   void emitIterationCountCheck(BasicBlock *Bypass);
576 
577   /// Emit a bypass check to see if all of the SCEV assumptions we've
578   /// had to make are correct. Returns the block containing the checks or
579   /// nullptr if no checks have been added.
580   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
581 
582   /// Emit bypass checks to check any memory assumptions we may have made.
583   /// Returns the block containing the checks or nullptr if no checks have been
584   /// added.
585   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
586 
587   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
588   /// vector loop preheader, middle block and scalar preheader.
589   void createVectorLoopSkeleton(StringRef Prefix);
590 
591   /// Create new phi nodes for the induction variables to resume iteration count
592   /// in the scalar epilogue, from where the vectorized loop left off.
593   /// In cases where the loop skeleton is more complicated (e.g., epilogue
594   /// vectorization) and the resume values can come from an additional bypass
595   /// block, the \p AdditionalBypass pair provides information about the bypass
596   /// block and the end value on the edge from bypass to this loop.
597   void createInductionResumeValues(
598       const SCEV2ValueTy &ExpandedSCEVs,
599       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
600 
601   /// Complete the loop skeleton by adding debug MDs, creating appropriate
602   /// conditional branches in the middle block, preparing the builder and
603   /// running the verifier. Return the preheader of the completed vector loop.
604   BasicBlock *completeLoopSkeleton();
605 
606   /// Allow subclasses to override and print debug traces before/after vplan
607   /// execution, when trace information is requested.
608   virtual void printDebugTracesAtStart(){};
609   virtual void printDebugTracesAtEnd(){};
610 
611   /// The original loop.
612   Loop *OrigLoop;
613 
614   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
615   /// dynamic knowledge to simplify SCEV expressions and converts them to a
616   /// more usable form.
617   PredicatedScalarEvolution &PSE;
618 
619   /// Loop Info.
620   LoopInfo *LI;
621 
622   /// Dominator Tree.
623   DominatorTree *DT;
624 
625   /// Target Library Info.
626   const TargetLibraryInfo *TLI;
627 
628   /// Target Transform Info.
629   const TargetTransformInfo *TTI;
630 
631   /// Assumption Cache.
632   AssumptionCache *AC;
633 
634   /// Interface to emit optimization remarks.
635   OptimizationRemarkEmitter *ORE;
636 
637   /// The vectorization SIMD factor to use. Each vector will have this many
638   /// vector elements.
639   ElementCount VF;
640 
641   ElementCount MinProfitableTripCount;
642 
643   /// The vectorization unroll factor to use. Each scalar is vectorized to this
644   /// many different vector instructions.
645   unsigned UF;
646 
647   /// The builder that we use
648   IRBuilder<> Builder;
649 
650   // --- Vectorization state ---
651 
652   /// The vector-loop preheader.
653   BasicBlock *LoopVectorPreHeader;
654 
655   /// The scalar-loop preheader.
656   BasicBlock *LoopScalarPreHeader;
657 
658   /// Middle Block between the vector and the scalar.
659   BasicBlock *LoopMiddleBlock;
660 
661   /// The unique ExitBlock of the scalar loop if one exists.  Note that
662   /// there can be multiple exiting edges reaching this block.
663   BasicBlock *LoopExitBlock;
664 
665   /// The scalar loop body.
666   BasicBlock *LoopScalarBody;
667 
668   /// A list of all bypass blocks. The first block is the entry of the loop.
669   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
670 
671   /// Store instructions that were predicated.
672   SmallVector<Instruction *, 4> PredicatedInstructions;
673 
674   /// Trip count of the original loop.
675   Value *TripCount = nullptr;
676 
677   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
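  /// For example, with TripCount = 23 and VF * UF = 8 this is 16; the
  /// remaining 7 iterations run in the scalar epilogue.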
678   Value *VectorTripCount = nullptr;
679 
680   /// The legality analysis.
681   LoopVectorizationLegality *Legal;
682 
683   /// The profitability analysis.
684   LoopVectorizationCostModel *Cost;
685 
686   // Record whether runtime checks are added.
687   bool AddedSafetyChecks = false;
688 
689   // Holds the end values for each induction variable. We save the end values
690   // so we can later fix-up the external users of the induction variables.
691   DenseMap<PHINode *, Value *> IVEndValues;
692 
693   /// BFI and PSI are used to check for profile guided size optimizations.
694   BlockFrequencyInfo *BFI;
695   ProfileSummaryInfo *PSI;
696 
697   // Whether this loop should be optimized for size based on profile guided size
698   // optimizations.
699   bool OptForSizeBasedOnProfile;
700 
701   /// Structure to hold information about generated runtime checks, responsible
702   /// for cleaning the checks, if vectorization turns out unprofitable.
703   GeneratedRTChecks &RTChecks;
704 
705   // Holds the resume values for reductions in the loops, used to set the
706   // correct start value of reduction PHIs when vectorizing the epilogue.
707   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
708       ReductionResumeValues;
709 };
710 
711 class InnerLoopUnroller : public InnerLoopVectorizer {
712 public:
713   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
714                     LoopInfo *LI, DominatorTree *DT,
715                     const TargetLibraryInfo *TLI,
716                     const TargetTransformInfo *TTI, AssumptionCache *AC,
717                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
718                     LoopVectorizationLegality *LVL,
719                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
720                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
721       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
722                             ElementCount::getFixed(1),
723                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
724                             BFI, PSI, Check) {}
725 };
726 
727 /// Encapsulate information regarding vectorization of a loop and its epilogue.
728 /// This information is meant to be updated and used across two stages of
729 /// epilogue vectorization.
730 struct EpilogueLoopVectorizationInfo {
731   ElementCount MainLoopVF = ElementCount::getFixed(0);
732   unsigned MainLoopUF = 0;
733   ElementCount EpilogueVF = ElementCount::getFixed(0);
734   unsigned EpilogueUF = 0;
735   BasicBlock *MainLoopIterationCountCheck = nullptr;
736   BasicBlock *EpilogueIterationCountCheck = nullptr;
737   BasicBlock *SCEVSafetyCheck = nullptr;
738   BasicBlock *MemSafetyCheck = nullptr;
739   Value *TripCount = nullptr;
740   Value *VectorTripCount = nullptr;
741 
742   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
743                                 ElementCount EVF, unsigned EUF)
744       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
745     assert(EUF == 1 &&
746            "A high UF for the epilogue loop is likely not beneficial.");
747   }
748 };
749 
750 /// An extension of the inner loop vectorizer that creates a skeleton for a
751 /// vectorized loop that has its epilogue (residual) also vectorized.
752 /// The idea is to run the vplan on a given loop twice, firstly to setup the
753 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
754 /// from the first step and vectorize the epilogue.  This is achieved by
755 /// deriving two concrete strategy classes from this base class and invoking
756 /// them in succession from the loop vectorizer planner.
757 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
758 public:
759   InnerLoopAndEpilogueVectorizer(
760       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
761       DominatorTree *DT, const TargetLibraryInfo *TLI,
762       const TargetTransformInfo *TTI, AssumptionCache *AC,
763       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
764       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
765       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
766       GeneratedRTChecks &Checks)
767       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
768                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
769                             CM, BFI, PSI, Checks),
770         EPI(EPI) {}
771 
772   // Override this function to handle the more complex control flow around the
773   // three loops.
774   std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
775       const SCEV2ValueTy &ExpandedSCEVs) final {
776     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
777   }
778 
779   /// The interface for creating a vectorized skeleton using one of two
780   /// different strategies, each corresponding to one execution of the vplan
781   /// as described above.
782   virtual std::pair<BasicBlock *, Value *>
783   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
784 
785   /// Holds and updates state information required to vectorize the main loop
786   /// and its epilogue in two separate passes. This setup helps us avoid
787   /// regenerating and recomputing runtime safety checks. It also helps us to
788   /// shorten the iteration-count-check path length for the cases where the
789   /// iteration count of the loop is so small that the main vector loop is
790   /// completely skipped.
791   EpilogueLoopVectorizationInfo &EPI;
792 };
793 
794 /// A specialized derived class of inner loop vectorizer that performs
795 /// vectorization of *main* loops in the process of vectorizing loops and their
796 /// epilogues.
797 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
798 public:
799   EpilogueVectorizerMainLoop(
800       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
801       DominatorTree *DT, const TargetLibraryInfo *TLI,
802       const TargetTransformInfo *TTI, AssumptionCache *AC,
803       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
804       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
805       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
806       GeneratedRTChecks &Check)
807       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
808                                        EPI, LVL, CM, BFI, PSI, Check) {}
809   /// Implements the interface for creating a vectorized skeleton using the
810   /// *main loop* strategy (ie the first pass of vplan execution).
811   std::pair<BasicBlock *, Value *>
812   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
813 
814 protected:
815   /// Emits an iteration count bypass check once for the main loop (when \p
816   /// ForEpilogue is false) and once for the epilogue loop (when \p
817   /// ForEpilogue is true).
818   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
819   void printDebugTracesAtStart() override;
820   void printDebugTracesAtEnd() override;
821 };
822 
823 // A specialized derived class of inner loop vectorizer that performs
824 // vectorization of *epilogue* loops in the process of vectorizing loops and
825 // their epilogues.
826 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
827 public:
828   EpilogueVectorizerEpilogueLoop(
829       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
830       DominatorTree *DT, const TargetLibraryInfo *TLI,
831       const TargetTransformInfo *TTI, AssumptionCache *AC,
832       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
833       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
834       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
835       GeneratedRTChecks &Checks)
836       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
837                                        EPI, LVL, CM, BFI, PSI, Checks) {
838     TripCount = EPI.TripCount;
839   }
840   /// Implements the interface for creating a vectorized skeleton using the
841   /// *epilogue loop* strategy (ie the second pass of vplan execution).
842   std::pair<BasicBlock *, Value *>
843   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
844 
845 protected:
846   /// Emits an iteration count bypass check after the main vector loop has
847   /// finished to see if there are any iterations left to execute by either
848   /// the vector epilogue or the scalar epilogue.
849   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
850                                                       BasicBlock *Bypass,
851                                                       BasicBlock *Insert);
852   void printDebugTracesAtStart() override;
853   void printDebugTracesAtEnd() override;
854 };
855 } // end namespace llvm
856 
857 /// Look for a meaningful debug location on the instruction or its
858 /// operands.
859 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
860   if (!I)
861     return DebugLoc();
862 
863   DebugLoc Empty;
864   if (I->getDebugLoc() != Empty)
865     return I->getDebugLoc();
866 
867   for (Use &Op : I->operands()) {
868     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
869       if (OpInst->getDebugLoc() != Empty)
870         return OpInst->getDebugLoc();
871   }
872 
873   return I->getDebugLoc();
874 }
875 
876 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
877 /// is passed, the message relates to that particular instruction.
878 #ifndef NDEBUG
879 static void debugVectorizationMessage(const StringRef Prefix,
880                                       const StringRef DebugMsg,
881                                       Instruction *I) {
882   dbgs() << "LV: " << Prefix << DebugMsg;
883   if (I != nullptr)
884     dbgs() << " " << *I;
885   else
886     dbgs() << '.';
887   dbgs() << '\n';
888 }
889 #endif
890 
891 /// Create an analysis remark that explains why vectorization failed
892 ///
893 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
894 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
895 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
896 /// the location of the remark.  \return the remark object that can be
897 /// streamed to.
898 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
899     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
900   Value *CodeRegion = TheLoop->getHeader();
901   DebugLoc DL = TheLoop->getStartLoc();
902 
903   if (I) {
904     CodeRegion = I->getParent();
905     // If there is no debug location attached to the instruction, revert back to
906     // using the loop's.
907     if (I->getDebugLoc())
908       DL = I->getDebugLoc();
909   }
910 
911   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
912 }
913 
914 namespace llvm {
915 
916 /// Return a value for Step multiplied by VF.
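/// For example (illustrative), with a scalable VF of <vscale x 4> and Step = 2
/// this returns a runtime value equal to vscale * 8.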
917 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
918                        int64_t Step) {
919   assert(Ty->isIntegerTy() && "Expected an integer step");
920   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
921 }
922 
923 /// Return the runtime value for VF.
924 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
925   return B.CreateElementCount(Ty, VF);
926 }
927 
928 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
929                                 Loop *OrigLoop) {
930   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
931   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
932 
933   ScalarEvolution &SE = *PSE.getSE();
934   return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
935 }
936 
937 void reportVectorizationFailure(const StringRef DebugMsg,
938                                 const StringRef OREMsg, const StringRef ORETag,
939                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
940                                 Instruction *I) {
941   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
942   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
943   ORE->emit(
944       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
945       << "loop not vectorized: " << OREMsg);
946 }
947 
948 /// Reports an informative message: print \p Msg for debugging purposes as well
949 /// as an optimization remark. Uses either \p I as location of the remark, or
950 /// otherwise \p TheLoop.
951 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
952                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
953                              Instruction *I = nullptr) {
954   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
955   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
956   ORE->emit(
957       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
958       << Msg);
959 }
960 
961 /// Report successful vectorization of the loop. In case an outer loop is
962 /// vectorized, prepend "outer" to the vectorization remark.
963 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
964                                 VectorizationFactor VF, unsigned IC) {
965   LLVM_DEBUG(debugVectorizationMessage(
966       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
967       nullptr));
968   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
969   ORE->emit([&]() {
970     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
971                               TheLoop->getHeader())
972            << "vectorized " << LoopType << "loop (vectorization width: "
973            << ore::NV("VectorizationFactor", VF.Width)
974            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
975   });
976 }
977 
978 } // end namespace llvm
979 
980 namespace llvm {
981 
982 // Loop vectorization cost-model hints how the scalar epilogue loop should be
983 // lowered.
984 enum ScalarEpilogueLowering {
985 
986   // The default: allowing scalar epilogues.
987   CM_ScalarEpilogueAllowed,
988 
989   // Vectorization with OptForSize: don't allow epilogues.
990   CM_ScalarEpilogueNotAllowedOptSize,
991 
992   // A special case of vectorisation with OptForSize: loops with a very small
993   // trip count are considered for vectorization under OptForSize, thereby
994   // making sure the cost of their loop body is dominant, free of runtime
995   // guards and scalar iteration overheads.
996   CM_ScalarEpilogueNotAllowedLowTripLoop,
997 
998   // Loop hint predicate indicating an epilogue is undesired.
999   CM_ScalarEpilogueNotNeededUsePredicate,
1000 
1001   // Directive indicating we must either tail fold or not vectorize
1002   CM_ScalarEpilogueNotAllowedUsePredicate
1003 };
1004 
1005 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1006 
1007 /// LoopVectorizationCostModel - estimates the expected speedups due to
1008 /// vectorization.
1009 /// In many cases vectorization is not profitable. This can happen because of
1010 /// a number of reasons. In this class we mainly attempt to predict the
1011 /// expected speedup/slowdowns due to the supported instruction set. We use the
1012 /// TargetTransformInfo to query the different backends for the cost of
1013 /// different operations.
1014 class LoopVectorizationCostModel {
1015 public:
1016   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1017                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1018                              LoopVectorizationLegality *Legal,
1019                              const TargetTransformInfo &TTI,
1020                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1021                              AssumptionCache *AC,
1022                              OptimizationRemarkEmitter *ORE, const Function *F,
1023                              const LoopVectorizeHints *Hints,
1024                              InterleavedAccessInfo &IAI)
1025       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1026         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1027         Hints(Hints), InterleaveInfo(IAI) {}
1028 
1029   /// \return An upper bound for the vectorization factors (both fixed and
1030   /// scalable). If the factors are 0, vectorization and interleaving should be
1031   /// avoided up front.
1032   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1033 
1034   /// \return True if runtime checks are required for vectorization, and false
1035   /// otherwise.
1036   bool runtimeChecksRequired();
1037 
1038   /// Setup cost-based decisions for user vectorization factor.
1039   /// \return true if the UserVF is a feasible VF to be chosen.
1040   bool selectUserVectorizationFactor(ElementCount UserVF) {
1041     collectUniformsAndScalars(UserVF);
1042     collectInstsToScalarize(UserVF);
1043     return expectedCost(UserVF).isValid();
1044   }
1045 
1046   /// \return The size (in bits) of the smallest and widest types in the code
1047   /// that needs to be vectorized. We ignore values that remain scalar such as
1048   /// 64 bit loop indices.
1049   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1050 
1051   /// \return The desired interleave count.
1052   /// If interleave count has been specified by metadata it will be returned.
1053   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1054   /// are the selected vectorization factor and the cost of the selected VF.
1055   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1056 
1057   /// Memory access instruction may be vectorized in more than one way.
1058   /// Form of instruction after vectorization depends on cost.
1059   /// This function takes cost-based decisions for Load/Store instructions
1060   /// and collects them in a map. This decisions map is used for building
1061   /// the lists of loop-uniform and loop-scalar instructions.
1062   /// The calculated cost is saved with widening decision in order to
1063   /// avoid redundant calculations.
1064   void setCostBasedWideningDecision(ElementCount VF);
1065 
1066   /// A call may be vectorized in different ways depending on whether we have
1067   /// vectorized variants available and whether the target supports masking.
1068   /// This function analyzes all calls in the function at the supplied VF,
1069   /// makes a decision based on the costs of available options, and stores that
1070   /// decision in a map for use in planning and plan execution.
1071   void setVectorizedCallDecision(ElementCount VF);
1072 
1073   /// A struct that represents some properties of the register usage
1074   /// of a loop.
1075   struct RegisterUsage {
1076     /// Holds the number of loop invariant values that are used in the loop.
1077     /// The key is ClassID of target-provided register class.
1078     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1079     /// Holds the maximum number of concurrent live intervals in the loop.
1080     /// The key is ClassID of target-provided register class.
1081     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1082   };
1083 
1084   /// \return Returns information about the register usages of the loop for the
1085   /// given vectorization factors.
1086   SmallVector<RegisterUsage, 8>
1087   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1088 
1089   /// Collect values we want to ignore in the cost model.
1090   void collectValuesToIgnore();
1091 
1092   /// Collect all element types in the loop for which widening is needed.
1093   void collectElementTypesForWidening();
1094 
1095   /// Split reductions into those that happen in the loop, and those that happen
1096   /// outside. In-loop reductions are collected into InLoopReductions.
1097   void collectInLoopReductions();
1098 
1099   /// Returns true if we should use strict in-order reductions for the given
1100   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1101   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1102   /// of FP operations.
1103   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1104     return !Hints->allowReordering() && RdxDesc.isOrdered();
1105   }
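  // For example (an illustrative sketch, not an upstream guarantee): a
  // floating-point sum such as
  //   float s = 0.0f;
  //   for (int i = 0; i < n; ++i) s += a[i];
  // compiled without reassociation has an ordered recurrence descriptor, so
  // useOrderedReductions() returns true and the reduction must be vectorized
  // with strictly in-order FP adds rather than a reassociated tree.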
1106 
1107   /// \returns The smallest bitwidth each instruction can be represented with.
1108   /// The vector equivalents of these instructions should be truncated to this
1109   /// type.
1110   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1111     return MinBWs;
1112   }
1113 
1114   /// \returns True if it is more profitable to scalarize instruction \p I for
1115   /// vectorization factor \p VF.
1116   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1117     assert(VF.isVector() &&
1118            "Profitable to scalarize relevant only for VF > 1.");
1119     assert(
1120         TheLoop->isInnermost() &&
1121         "cost-model should not be used for outer loops (in VPlan-native path)");
1122 
1123     auto Scalars = InstsToScalarize.find(VF);
1124     assert(Scalars != InstsToScalarize.end() &&
1125            "VF not yet analyzed for scalarization profitability");
1126     return Scalars->second.contains(I);
1127   }
1128 
1129   /// Returns true if \p I is known to be uniform after vectorization.
1130   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1131     assert(
1132         TheLoop->isInnermost() &&
1133         "cost-model should not be used for outer loops (in VPlan-native path)");
1134     // Pseudo probe needs to be duplicated for each unrolled iteration and
1135     // vector lane so that profiled loop trip count can be accurately
1136     // accumulated instead of being under counted.
1137     if (isa<PseudoProbeInst>(I))
1138       return false;
1139 
1140     if (VF.isScalar())
1141       return true;
1142 
1143     auto UniformsPerVF = Uniforms.find(VF);
1144     assert(UniformsPerVF != Uniforms.end() &&
1145            "VF not yet analyzed for uniformity");
1146     return UniformsPerVF->second.count(I);
1147   }
1148 
1149   /// Returns true if \p I is known to be scalar after vectorization.
1150   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1151     assert(
1152         TheLoop->isInnermost() &&
1153         "cost-model should not be used for outer loops (in VPlan-native path)");
1154     if (VF.isScalar())
1155       return true;
1156 
1157     auto ScalarsPerVF = Scalars.find(VF);
1158     assert(ScalarsPerVF != Scalars.end() &&
1159            "Scalar values are not calculated for VF");
1160     return ScalarsPerVF->second.count(I);
1161   }
1162 
1163   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1164   /// for vectorization factor \p VF.
1165   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1166     return VF.isVector() && MinBWs.contains(I) &&
1167            !isProfitableToScalarize(I, VF) &&
1168            !isScalarAfterVectorization(I, VF);
1169   }
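  // For example (an illustrative sketch): if demanded-bits analysis shows an
  // i32 value in the loop only ever carries 8 meaningful bits, MinBWs may map
  // it to 8; for a vector VF its widened form can then be computed on
  // <VF x i8> instead of <VF x i32>, provided the instruction is neither
  // scalarized nor profitable to scalarize at that VF.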
1170 
1171   /// Decision that was taken during cost calculation for memory instruction.
1172   enum InstWidening {
1173     CM_Unknown,
1174     CM_Widen,         // For consecutive accesses with stride +1.
1175     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1176     CM_Interleave,
1177     CM_GatherScatter,
1178     CM_Scalarize,
1179     CM_VectorCall,
1180     CM_IntrinsicCall
1181   };
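  // Illustrative mapping (a sketch, not an exhaustive rule): a load of a[i] in
  // the loop is typically CM_Widen (consecutive, stride +1), a load of a[n - i]
  // is CM_Widen_Reverse, a load of a[b[i]] can only be CM_GatherScatter or
  // CM_Scalarize, and members of a group such as the pair a[2*i], a[2*i+1]
  // are handled together as CM_Interleave.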
1182 
1183   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1184   /// instruction \p I and vector width \p VF.
1185   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1186                            InstructionCost Cost) {
1187     assert(VF.isVector() && "Expected VF >=2");
1188     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1189   }
1190 
1191   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1192   /// interleaving group \p Grp and vector width \p VF.
1193   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1194                            ElementCount VF, InstWidening W,
1195                            InstructionCost Cost) {
1196     assert(VF.isVector() && "Expected VF >=2");
1197     /// Broadcast this decision to all instructions inside the group.
1198     /// But the cost will be assigned to one instruction only.
1199     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1200       if (auto *I = Grp->getMember(i)) {
1201         if (Grp->getInsertPos() == I)
1202           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1203         else
1204           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1205       }
1206     }
1207   }
1208 
1209   /// Return the cost model decision for the given instruction \p I and vector
1210   /// width \p VF. Return CM_Unknown if this instruction did not pass
1211   /// through the cost modeling.
1212   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1213     assert(VF.isVector() && "Expected VF to be a vector VF");
1214     assert(
1215         TheLoop->isInnermost() &&
1216         "cost-model should not be used for outer loops (in VPlan-native path)");
1217 
1218     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1219     auto Itr = WideningDecisions.find(InstOnVF);
1220     if (Itr == WideningDecisions.end())
1221       return CM_Unknown;
1222     return Itr->second.first;
1223   }
1224 
1225   /// Return the vectorization cost for the given instruction \p I and vector
1226   /// width \p VF.
1227   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1228     assert(VF.isVector() && "Expected VF >=2");
1229     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1230     assert(WideningDecisions.contains(InstOnVF) &&
1231            "The cost is not calculated");
1232     return WideningDecisions[InstOnVF].second;
1233   }
1234 
1235   struct CallWideningDecision {
1236     InstWidening Kind;
1237     Function *Variant;
1238     Intrinsic::ID IID;
1239     std::optional<unsigned> MaskPos;
1240     InstructionCost Cost;
1241   };
1242 
1243   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1244                                Function *Variant, Intrinsic::ID IID,
1245                                std::optional<unsigned> MaskPos,
1246                                InstructionCost Cost) {
1247     assert(!VF.isScalar() && "Expected vector VF");
1248     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1249                                                      MaskPos, Cost};
1250   }
1251 
1252   CallWideningDecision getCallWideningDecision(CallInst *CI,
1253                                                ElementCount VF) const {
1254     assert(!VF.isScalar() && "Expected vector VF");
1255     return CallWideningDecisions.at(std::make_pair(CI, VF));
1256   }
1257 
1258   /// Return True if instruction \p I is an optimizable truncate whose operand
1259   /// is an induction variable. Such a truncate will be removed by adding a new
1260   /// induction variable with the destination type.
1261   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1262     // If the instruction is not a truncate, return false.
1263     auto *Trunc = dyn_cast<TruncInst>(I);
1264     if (!Trunc)
1265       return false;
1266 
1267     // Get the source and destination types of the truncate.
1268     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1269     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1270 
1271     // If the truncate is free for the given types, return false. Replacing a
1272     // free truncate with an induction variable would add an induction variable
1273     // update instruction to each iteration of the loop. We exclude from this
1274     // check the primary induction variable since it will need an update
1275     // instruction regardless.
1276     Value *Op = Trunc->getOperand(0);
1277     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1278       return false;
1279 
1280     // If the truncated value is not an induction variable, return false.
1281     return Legal->isInductionPhi(Op);
1282   }
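  // For example (an illustrative sketch): with a 64-bit primary induction
  // variable, source such as
  //   for (long i = 0; i < n; ++i) b[i] = (int)i;
  // contains a truncate of the induction to i32. If that truncate is not free
  // for the target, it is "optimizable": the vectorizer can introduce a
  // separate i32 induction variable instead of truncating the wide one on
  // every iteration.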
1283 
1284   /// Collects the instructions to scalarize for each predicated instruction in
1285   /// the loop.
1286   void collectInstsToScalarize(ElementCount VF);
1287 
1288   /// Collect Uniform and Scalar values for the given \p VF.
1289   /// The sets depend on CM decision for Load/Store instructions
1290   /// that may be vectorized as interleave, gather-scatter or scalarized.
1291   /// Also make a decision on what to do about call instructions in the loop
1292   /// at that VF -- scalarize, call a known vector routine, or call a
1293   /// vector intrinsic.
1294   void collectUniformsAndScalars(ElementCount VF) {
1295     // Do the analysis once.
1296     if (VF.isScalar() || Uniforms.contains(VF))
1297       return;
1298     setCostBasedWideningDecision(VF);
1299     setVectorizedCallDecision(VF);
1300     collectLoopUniforms(VF);
1301     collectLoopScalars(VF);
1302   }
1303 
1304   /// Returns true if the target machine supports masked store operation
1305   /// for the given \p DataType and kind of access to \p Ptr.
1306   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1307     return Legal->isConsecutivePtr(DataType, Ptr) &&
1308            TTI.isLegalMaskedStore(DataType, Alignment);
1309   }
1310 
1311   /// Returns true if the target machine supports masked load operation
1312   /// for the given \p DataType and kind of access to \p Ptr.
1313   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1314     return Legal->isConsecutivePtr(DataType, Ptr) &&
1315            TTI.isLegalMaskedLoad(DataType, Alignment);
1316   }
1317 
1318   /// Returns true if the target machine can represent \p V as a masked gather
1319   /// or scatter operation.
1320   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1321     bool LI = isa<LoadInst>(V);
1322     bool SI = isa<StoreInst>(V);
1323     if (!LI && !SI)
1324       return false;
1325     auto *Ty = getLoadStoreType(V);
1326     Align Align = getLoadStoreAlignment(V);
1327     if (VF.isVector())
1328       Ty = VectorType::get(Ty, VF);
1329     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1330            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1331   }
1332 
1333   /// Returns true if the target machine supports all of the reduction
1334   /// variables found for the given VF.
1335   bool canVectorizeReductions(ElementCount VF) const {
1336     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1337       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1338       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1339     }));
1340   }
1341 
1342   /// Given costs for both strategies, return true if the scalar predication
1343   /// lowering should be used for div/rem.  This incorporates an override
1344   /// option so it is not simply a cost comparison.
1345   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1346                                      InstructionCost SafeDivisorCost) const {
1347     switch (ForceSafeDivisor) {
1348     case cl::BOU_UNSET:
1349       return ScalarCost < SafeDivisorCost;
1350     case cl::BOU_TRUE:
1351       return false;
1352     case cl::BOU_FALSE:
1353       return true;
1354     };
1355     llvm_unreachable("impossible case value");
1356   }
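  // For example (an illustrative sketch): a predicated division such as
  //   if (c[i] != 0) a[i] = b[i] / c[i];
  // can either be scalarized lane by lane under its predicate, or kept as a
  // vector division after substituting a known-safe divisor (e.g. 1) into the
  // masked-off lanes. When the ForceSafeDivisor override is unset, the cheaper
  // of the two costs wins; otherwise the override selects the strategy.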
1357 
1358   /// Returns true if \p I is an instruction which requires predication and
1359   /// for which our chosen predication strategy is scalarization (i.e. we
1360   /// don't have an alternate strategy such as masking available).
1361   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1362   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1363 
1364   /// Returns true if \p I is an instruction that needs to be predicated
1365   /// at runtime.  The result is independent of the predication mechanism.
1366   /// Superset of instructions that return true for isScalarWithPredication.
1367   bool isPredicatedInst(Instruction *I) const;
1368 
1369   /// Return the costs for our two available strategies for lowering a
1370   /// div/rem operation which requires speculating at least one lane.
1371   /// First result is for scalarization (will be invalid for scalable
1372   /// vectors); second is for the safe-divisor strategy.
1373   std::pair<InstructionCost, InstructionCost>
1374   getDivRemSpeculationCost(Instruction *I,
1375                            ElementCount VF) const;
1376 
1377   /// Returns true if \p I is a memory instruction with consecutive memory
1378   /// access that can be widened.
1379   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1380 
1381   /// Returns true if \p I is a memory instruction in an interleaved-group
1382   /// of memory accesses that can be vectorized with wide vector loads/stores
1383   /// and shuffles.
1384   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1385 
1386   /// Check if \p Instr belongs to any interleaved access group.
1387   bool isAccessInterleaved(Instruction *Instr) const {
1388     return InterleaveInfo.isInterleaved(Instr);
1389   }
1390 
1391   /// Get the interleaved access group that \p Instr belongs to.
1392   const InterleaveGroup<Instruction> *
1393   getInterleavedAccessGroup(Instruction *Instr) const {
1394     return InterleaveInfo.getInterleaveGroup(Instr);
1395   }
1396 
1397   /// Returns true if we're required to use a scalar epilogue for at least
1398   /// the final iteration of the original loop.
1399   bool requiresScalarEpilogue(bool IsVectorizing) const {
1400     if (!isScalarEpilogueAllowed()) {
1401       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1402       return false;
1403     }
1404     // If we might exit from anywhere but the latch, we must run the exiting
1405     // iteration in scalar form.
1406     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1407       LLVM_DEBUG(
1408           dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1409       return true;
1410     }
1411     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1412       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1413                            "interleaved group requires scalar epilogue\n");
1414       return true;
1415     }
1416     LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1417     return false;
1418   }
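  // For example (an illustrative sketch): an interleaved group that accesses
  // a[3*i] and a[3*i+1] but never a[3*i+2] has a gap, so its widened loads
  // could touch memory past the end of 'a' on the last vector iteration;
  // running the final iteration(s) in a scalar epilogue avoids that, which is
  // what InterleaveInfo.requiresScalarEpilogue() reports.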
1419 
1420   /// Returns true if we're required to use a scalar epilogue for at least
1421   /// the final iteration of the original loop for all VFs in \p Range.
1422   /// A scalar epilogue must either be required for all VFs in \p Range or for
1423   /// none.
1424   bool requiresScalarEpilogue(VFRange Range) const {
1425     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1426       return requiresScalarEpilogue(VF.isVector());
1427     };
1428     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1429     assert(
1430         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1431         "all VFs in range must agree on whether a scalar epilogue is required");
1432     return IsRequired;
1433   }
1434 
1435   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1436   /// loop hint annotation.
1437   bool isScalarEpilogueAllowed() const {
1438     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1439   }
1440 
1441   /// Returns the TailFoldingStyle that is best for the current loop.
1442   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1443     if (!ChosenTailFoldingStyle)
1444       return TailFoldingStyle::None;
1445     return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1446                                : ChosenTailFoldingStyle->second;
1447   }
1448 
1449   /// Selects and saves the TailFoldingStyle for the two cases of whether the IV
1450   /// update may overflow or not.
1451   /// \param IsScalableVF true if scalable vector factors are enabled.
1452   /// \param UserIC User-specified interleave count.
1453   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1454     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1455     if (!Legal->canFoldTailByMasking()) {
1456       ChosenTailFoldingStyle =
1457           std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1458       return;
1459     }
1460 
1461     if (!ForceTailFoldingStyle.getNumOccurrences()) {
1462       ChosenTailFoldingStyle = std::make_pair(
1463           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1464           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1465       return;
1466     }
1467 
1468     // Set styles when forced.
1469     ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1470                                             ForceTailFoldingStyle.getValue());
1471     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1472       return;
1473     // Override forced styles if needed.
1474     // FIXME: use actual opcode/data type for analysis here.
1475     // FIXME: Investigate opportunity for fixed vector factor.
1476     bool EVLIsLegal =
1477         IsScalableVF && UserIC <= 1 &&
1478         TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1479         !EnableVPlanNativePath &&
1480         // FIXME: implement support for max safe dependency distance.
1481         Legal->isSafeForAnyVectorWidth();
1482     if (!EVLIsLegal) {
1483       // If for some reason EVL mode is unsupported, fall back to
1484       // DataWithoutLaneMask to try to vectorize the loop with folded tail
1485       // in a generic way.
1486       ChosenTailFoldingStyle =
1487           std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1488                          TailFoldingStyle::DataWithoutLaneMask);
1489       LLVM_DEBUG(
1490           dbgs()
1491           << "LV: Preference for VP intrinsics indicated. Will "
1492              "not try to generate VP Intrinsics "
1493           << (UserIC > 1
1494                   ? "since interleave count specified is greater than 1.\n"
1495                   : "due to non-interleaving reasons.\n"));
1496     }
1497   }
1498 
1499   /// Returns true if all loop blocks should be masked to fold tail loop.
1500   bool foldTailByMasking() const {
1501     // TODO: check if it is possible to check for None style independent of
1502     // IVUpdateMayOverflow flag in getTailFoldingStyle.
1503     return getTailFoldingStyle() != TailFoldingStyle::None;
1504   }
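  // For example (an illustrative sketch): with VF = 4 and a trip count of 10,
  // folding the tail by masking runs three vector iterations; the last one
  // executes under a mask (or an explicit vector length) that enables only the
  // two remaining lanes, so no scalar remainder loop is needed.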
1505 
1506   /// Returns true if the instructions in this block requires predication
1507   /// for any reason, e.g. because tail folding now requires a predicate
1508   /// or because the block in the original loop was predicated.
1509   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1510     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1511   }
1512 
1513   /// Returns true if VP intrinsics with explicit vector length support should
1514   /// be generated in the tail folded loop.
1515   bool foldTailWithEVL() const {
1516     return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1517   }
1518 
1519   /// Returns true if the Phi is part of an inloop reduction.
1520   bool isInLoopReduction(PHINode *Phi) const {
1521     return InLoopReductions.contains(Phi);
1522   }
1523 
1524   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1525   /// with factor VF.  Return the cost of the instruction, including
1526   /// scalarization overhead if it's needed.
1527   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1528 
1529   /// Estimate cost of a call instruction CI if it were vectorized with factor
1530   /// VF. Return the cost of the instruction, including scalarization overhead
1531   /// if it's needed.
1532   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1533 
1534   /// Invalidates decisions already taken by the cost model.
1535   void invalidateCostModelingDecisions() {
1536     WideningDecisions.clear();
1537     CallWideningDecisions.clear();
1538     Uniforms.clear();
1539     Scalars.clear();
1540   }
1541 
1542   /// Returns the expected execution cost. The unit of the cost does
1543   /// not matter because we use the 'cost' units to compare different
1544   /// vector widths. The cost that is returned is *not* normalized by
1545   /// the factor width. If \p Invalid is not nullptr, this function
1546   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1547   /// each instruction that has an Invalid cost for the given VF.
1548   InstructionCost
1549   expectedCost(ElementCount VF,
1550                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1551 
1552   bool hasPredStores() const { return NumPredStores > 0; }
1553 
1554   /// Returns true if epilogue vectorization is considered profitable, and
1555   /// false otherwise.
1556   /// \p VF is the vectorization factor chosen for the original loop.
1557   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1558 
1559   /// Returns the execution time cost of an instruction for a given vector
1560   /// width. Vector width of one means scalar.
1561   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1562 
1563   /// Return the cost of instructions in an inloop reduction pattern, if I is
1564   /// part of that pattern.
1565   std::optional<InstructionCost>
1566   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1567                           TTI::TargetCostKind CostKind) const;
1568 
1569 private:
1570   unsigned NumPredStores = 0;
1571 
1572   /// \return An upper bound for the vectorization factors for both
1573   /// fixed and scalable vectorization, where the minimum-known number of
1574   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1575   /// disabled or unsupported, then the scalable part will be equal to
1576   /// ElementCount::getScalable(0).
1577   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1578                                            ElementCount UserVF,
1579                                            bool FoldTailByMasking);
1580 
1581   /// \return the maximized element count based on the target's vector
1582   /// registers and the loop trip-count, but limited to a maximum safe VF.
1583   /// This is a helper function of computeFeasibleMaxVF.
1584   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1585                                        unsigned SmallestType,
1586                                        unsigned WidestType,
1587                                        ElementCount MaxSafeVF,
1588                                        bool FoldTailByMasking);
1589 
1590   /// Checks if scalable vectorization is supported and enabled. Caches the
1591   /// result to avoid repeated debug dumps for repeated queries.
1592   bool isScalableVectorizationAllowed();
1593 
1594   /// \return the maximum legal scalable VF, based on the safe max number
1595   /// of elements.
1596   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1597 
1598   /// Calculate vectorization cost of memory instruction \p I.
1599   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1600 
1601   /// The cost computation for scalarized memory instruction.
1602   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1603 
1604   /// The cost computation for interleaving group of memory instructions.
1605   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1606 
1607   /// The cost computation for Gather/Scatter instruction.
1608   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1609 
1610   /// The cost computation for widening instruction \p I with consecutive
1611   /// memory access.
1612   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1613 
1614   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1615   /// Load: scalar load + broadcast.
1616   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1617   /// element)
1618   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1619 
1620   /// Estimate the overhead of scalarizing an instruction. This is a
1621   /// convenience wrapper for the type-based getScalarizationOverhead API.
1622   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1623                                            TTI::TargetCostKind CostKind) const;
1624 
1625   /// Returns true if an artificially high cost for emulated masked memrefs
1626   /// should be used.
1627   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1628 
1629   /// Map of scalar integer values to the smallest bitwidth they can be legally
1630   /// represented as. The vector equivalents of these values should be truncated
1631   /// to this type.
1632   MapVector<Instruction *, uint64_t> MinBWs;
1633 
1634   /// A type representing the costs for instructions if they were to be
1635   /// scalarized rather than vectorized. The entries are Instruction-Cost
1636   /// pairs.
1637   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1638 
1639   /// A set containing all BasicBlocks that are known to be present after
1640   /// vectorization as a predicated block.
1641   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1642       PredicatedBBsAfterVectorization;
1643 
1644   /// Records whether it is allowed to have the original scalar loop execute at
1645   /// least once. This may be needed as a fallback loop in case runtime
1646   /// aliasing/dependence checks fail, or to handle the tail/remainder
1647   /// iterations when the trip count is unknown or doesn't divide by the VF,
1648   /// or as a peel-loop to handle gaps in interleave-groups.
1649   /// Under optsize and when the trip count is very small we don't allow any
1650   /// iterations to execute in the scalar loop.
1651   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1652 
1653   /// Controls the finally chosen tail-folding style. The first element is used
1654   /// if the IV update may overflow, the second if it does not.
1655   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1656       ChosenTailFoldingStyle;
1657 
1658   /// true if scalable vectorization is supported and enabled.
1659   std::optional<bool> IsScalableVectorizationAllowed;
1660 
1661   /// A map holding scalar costs for different vectorization factors. The
1662   /// presence of a cost for an instruction in the mapping indicates that the
1663   /// instruction will be scalarized when vectorizing with the associated
1664   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1665   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1666 
1667   /// Holds the instructions known to be uniform after vectorization.
1668   /// The data is collected per VF.
1669   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1670 
1671   /// Holds the instructions known to be scalar after vectorization.
1672   /// The data is collected per VF.
1673   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1674 
1675   /// Holds the instructions (address computations) that are forced to be
1676   /// scalarized.
1677   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1678 
1679   /// PHINodes of the reductions that should be expanded in-loop.
1680   SmallPtrSet<PHINode *, 4> InLoopReductions;
1681 
1682   /// A Map of inloop reduction operations and their immediate chain operand.
1683   /// FIXME: This can be removed once reductions can be costed correctly in
1684   /// VPlan. This was added to allow quick lookup of the inloop operations.
1685   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1686 
1687   /// Returns the expected difference in cost from scalarizing the expression
1688   /// feeding a predicated instruction \p PredInst. The instructions to
1689   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1690   /// non-negative return value implies the expression will be scalarized.
1691   /// Currently, only single-use chains are considered for scalarization.
1692   InstructionCost computePredInstDiscount(Instruction *PredInst,
1693                                           ScalarCostsTy &ScalarCosts,
1694                                           ElementCount VF);
1695 
1696   /// Collect the instructions that are uniform after vectorization. An
1697   /// instruction is uniform if we represent it with a single scalar value in
1698   /// the vectorized loop corresponding to each vector iteration. Examples of
1699   /// uniform instructions include pointer operands of consecutive or
1700   /// interleaved memory accesses. Note that although uniformity implies an
1701   /// instruction will be scalar, the reverse is not true. In general, a
1702   /// scalarized instruction will be represented by VF scalar values in the
1703   /// vectorized loop, each corresponding to an iteration of the original
1704   /// scalar loop.
1705   void collectLoopUniforms(ElementCount VF);
1706 
1707   /// Collect the instructions that are scalar after vectorization. An
1708   /// instruction is scalar if it is known to be uniform or will be scalarized
1709   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1710   /// to the list if they are used by a load/store instruction that is marked as
1711   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1712   /// VF values in the vectorized loop, each corresponding to an iteration of
1713   /// the original scalar loop.
1714   void collectLoopScalars(ElementCount VF);
1715 
1716   /// Keeps cost model vectorization decision and cost for instructions.
1717   /// Right now it is used for memory instructions only.
1718   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1719                                 std::pair<InstWidening, InstructionCost>>;
1720 
1721   DecisionList WideningDecisions;
1722 
1723   using CallDecisionList =
1724       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1725 
1726   CallDecisionList CallWideningDecisions;
1727 
1728   /// Returns true if \p V is expected to be vectorized and it needs to be
1729   /// extracted.
1730   bool needsExtract(Value *V, ElementCount VF) const {
1731     Instruction *I = dyn_cast<Instruction>(V);
1732     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1733         TheLoop->isLoopInvariant(I))
1734       return false;
1735 
1736     // Assume we can vectorize V (and hence we need extraction) if the
1737     // scalars are not computed yet. This can happen, because it is called
1738     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1739     // the scalars are collected. That should be a safe assumption in most
1740     // cases, because we check if the operands have vectorizable types
1741     // beforehand in LoopVectorizationLegality.
1742     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1743   };
1744 
1745   /// Returns a range containing only operands needing to be extracted.
1746   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1747                                                    ElementCount VF) const {
1748     return SmallVector<Value *, 4>(make_filter_range(
1749         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1750   }
1751 
1752 public:
1753   /// The loop that we evaluate.
1754   Loop *TheLoop;
1755 
1756   /// Predicated scalar evolution analysis.
1757   PredicatedScalarEvolution &PSE;
1758 
1759   /// Loop Info analysis.
1760   LoopInfo *LI;
1761 
1762   /// Vectorization legality.
1763   LoopVectorizationLegality *Legal;
1764 
1765   /// Vector target information.
1766   const TargetTransformInfo &TTI;
1767 
1768   /// Target Library Info.
1769   const TargetLibraryInfo *TLI;
1770 
1771   /// Demanded bits analysis.
1772   DemandedBits *DB;
1773 
1774   /// Assumption cache.
1775   AssumptionCache *AC;
1776 
1777   /// Interface to emit optimization remarks.
1778   OptimizationRemarkEmitter *ORE;
1779 
1780   const Function *TheFunction;
1781 
1782   /// Loop Vectorize Hint.
1783   const LoopVectorizeHints *Hints;
1784 
1785   /// The interleave access information contains groups of interleaved accesses
1786   /// that have the same stride and are close to each other.
1787   InterleavedAccessInfo &InterleaveInfo;
1788 
1789   /// Values to ignore in the cost model.
1790   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1791 
1792   /// Values to ignore in the cost model when VF > 1.
1793   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1794 
1795   /// All element types found in the loop.
1796   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1797 };
1798 } // end namespace llvm
1799 
1800 namespace {
1801 /// Helper struct to manage generating runtime checks for vectorization.
1802 ///
1803 /// The runtime checks are created up-front in temporary blocks, un-linked from
1804 /// the existing IR, to allow better cost estimation. After deciding to
1805 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1806 /// temporary blocks are completely removed.
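///
/// A rough usage sketch based on the members declared below (variable names
/// are placeholders, not a verbatim copy of the caller):
/// \code
///   GeneratedRTChecks Checks(SE, DT, LI, TTI, DL, AddBranchWeights);
///   Checks.Create(L, LAI, UnionPred, VF, IC);  // build checks up-front
///   InstructionCost RTCost = Checks.getCost(); // feed the cost model
///   // Only if vectorization goes ahead:
///   Checks.emitSCEVChecks(Bypass, VectorPH, ExitBB);
///   Checks.emitMemRuntimeChecks(Bypass, VectorPH);
///   // Otherwise the destructor removes the unused temporary blocks.
/// \endcode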
1807 class GeneratedRTChecks {
1808   /// Basic block which contains the generated SCEV checks, if any.
1809   BasicBlock *SCEVCheckBlock = nullptr;
1810 
1811   /// The value representing the result of the generated SCEV checks. If it is
1812   /// nullptr, either no SCEV checks have been generated or they have been used.
1813   Value *SCEVCheckCond = nullptr;
1814 
1815   /// Basic block which contains the generated memory runtime checks, if any.
1816   BasicBlock *MemCheckBlock = nullptr;
1817 
1818   /// The value representing the result of the generated memory runtime checks.
1819   /// If it is nullptr, either no memory runtime checks have been generated or
1820   /// they have been used.
1821   Value *MemRuntimeCheckCond = nullptr;
1822 
1823   DominatorTree *DT;
1824   LoopInfo *LI;
1825   TargetTransformInfo *TTI;
1826 
1827   SCEVExpander SCEVExp;
1828   SCEVExpander MemCheckExp;
1829 
1830   bool CostTooHigh = false;
1831   const bool AddBranchWeights;
1832 
1833   Loop *OuterLoop = nullptr;
1834 
1835 public:
1836   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1837                     TargetTransformInfo *TTI, const DataLayout &DL,
1838                     bool AddBranchWeights)
1839       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1840         MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1841 
1842   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1843   /// accurately estimate the cost of the runtime checks. The blocks are
1844   /// un-linked from the IR and is added back during vector code generation. If
1845   /// un-linked from the IR and added back during vector code generation. If
1846   /// completely.
1847   void Create(Loop *L, const LoopAccessInfo &LAI,
1848               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1849 
1850     // Hard cutoff to limit compile-time increase in case a very large number of
1851     // runtime checks needs to be generated.
1852     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1853     // profile info.
1854     CostTooHigh =
1855         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1856     if (CostTooHigh)
1857       return;
1858 
1859     BasicBlock *LoopHeader = L->getHeader();
1860     BasicBlock *Preheader = L->getLoopPreheader();
1861 
1862     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1863     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1864     // may be used by SCEVExpander. The blocks will be un-linked from their
1865     // predecessors and removed from LI & DT at the end of the function.
1866     if (!UnionPred.isAlwaysTrue()) {
1867       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1868                                   nullptr, "vector.scevcheck");
1869 
1870       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1871           &UnionPred, SCEVCheckBlock->getTerminator());
1872     }
1873 
1874     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1875     if (RtPtrChecking.Need) {
1876       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1877       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1878                                  "vector.memcheck");
1879 
1880       auto DiffChecks = RtPtrChecking.getDiffChecks();
1881       if (DiffChecks) {
1882         Value *RuntimeVF = nullptr;
1883         MemRuntimeCheckCond = addDiffRuntimeChecks(
1884             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1885             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1886               if (!RuntimeVF)
1887                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1888               return RuntimeVF;
1889             },
1890             IC);
1891       } else {
1892         MemRuntimeCheckCond = addRuntimeChecks(
1893             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1894             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1895       }
1896       assert(MemRuntimeCheckCond &&
1897              "no RT checks generated although RtPtrChecking "
1898              "claimed checks are required");
1899     }
1900 
1901     if (!MemCheckBlock && !SCEVCheckBlock)
1902       return;
1903 
1904     // Unhook the temporary block with the checks, update various places
1905     // accordingly.
1906     if (SCEVCheckBlock)
1907       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1908     if (MemCheckBlock)
1909       MemCheckBlock->replaceAllUsesWith(Preheader);
1910 
1911     if (SCEVCheckBlock) {
1912       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1913       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1914       Preheader->getTerminator()->eraseFromParent();
1915     }
1916     if (MemCheckBlock) {
1917       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1918       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1919       Preheader->getTerminator()->eraseFromParent();
1920     }
1921 
1922     DT->changeImmediateDominator(LoopHeader, Preheader);
1923     if (MemCheckBlock) {
1924       DT->eraseNode(MemCheckBlock);
1925       LI->removeBlock(MemCheckBlock);
1926     }
1927     if (SCEVCheckBlock) {
1928       DT->eraseNode(SCEVCheckBlock);
1929       LI->removeBlock(SCEVCheckBlock);
1930     }
1931 
1932     // Outer loop is used as part of the later cost calculations.
1933     OuterLoop = L->getParentLoop();
1934   }
1935 
1936   InstructionCost getCost() {
1937     if (SCEVCheckBlock || MemCheckBlock)
1938       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1939 
1940     if (CostTooHigh) {
1941       InstructionCost Cost;
1942       Cost.setInvalid();
1943       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1944       return Cost;
1945     }
1946 
1947     InstructionCost RTCheckCost = 0;
1948     if (SCEVCheckBlock)
1949       for (Instruction &I : *SCEVCheckBlock) {
1950         if (SCEVCheckBlock->getTerminator() == &I)
1951           continue;
1952         InstructionCost C =
1953             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1954         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1955         RTCheckCost += C;
1956       }
1957     if (MemCheckBlock) {
1958       InstructionCost MemCheckCost = 0;
1959       for (Instruction &I : *MemCheckBlock) {
1960         if (MemCheckBlock->getTerminator() == &I)
1961           continue;
1962         InstructionCost C =
1963             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1964         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1965         MemCheckCost += C;
1966       }
1967 
1968       // If the runtime memory checks are being created inside an outer loop
1969       // we should find out if these checks are outer loop invariant. If so,
1970       // the checks will likely be hoisted out and so the effective cost will
1971       // be reduced according to the outer loop trip count.
1972       if (OuterLoop) {
1973         ScalarEvolution *SE = MemCheckExp.getSE();
1974         // TODO: If profitable, we could refine this further by analysing every
1975         // individual memory check, since there could be a mixture of loop
1976         // variant and invariant checks that mean the final condition is
1977         // variant.
1978         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1979         if (SE->isLoopInvariant(Cond, OuterLoop)) {
1980           // It seems reasonable to assume that we can reduce the effective
1981           // cost of the checks even when we know nothing about the trip
1982           // count. Assume that the outer loop executes at least twice.
1983           unsigned BestTripCount = 2;
1984 
1985           // If exact trip count is known use that.
1986           if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
1987             BestTripCount = SmallTC;
1988           else if (LoopVectorizeWithBlockFrequency) {
1989             // Else use profile data if available.
1990             if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
1991               BestTripCount = *EstimatedTC;
1992           }
1993 
1994           BestTripCount = std::max(BestTripCount, 1U);
1995           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1996 
1997           // Let's ensure the cost is always at least 1.
1998           NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
1999                                      (InstructionCost::CostType)1);
2000 
2001           if (BestTripCount > 1)
2002             LLVM_DEBUG(dbgs()
2003                        << "We expect runtime memory checks to be hoisted "
2004                        << "out of the outer loop. Cost reduced from "
2005                        << MemCheckCost << " to " << NewMemCheckCost << '\n');
2006 
2007           MemCheckCost = NewMemCheckCost;
2008         }
2009       }
2010 
2011       RTCheckCost += MemCheckCost;
2012     }
2013 
2014     if (SCEVCheckBlock || MemCheckBlock)
2015       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2016                         << "\n");
2017 
2018     return RTCheckCost;
2019   }
2020 
2021   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2022   /// unused.
2023   ~GeneratedRTChecks() {
2024     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2025     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2026     if (!SCEVCheckCond)
2027       SCEVCleaner.markResultUsed();
2028 
2029     if (!MemRuntimeCheckCond)
2030       MemCheckCleaner.markResultUsed();
2031 
2032     if (MemRuntimeCheckCond) {
2033       auto &SE = *MemCheckExp.getSE();
2034       // Memory runtime check generation creates compares that use expanded
2035       // values. Remove them before running the SCEVExpanderCleaners.
2036       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2037         if (MemCheckExp.isInsertedInstruction(&I))
2038           continue;
2039         SE.forgetValue(&I);
2040         I.eraseFromParent();
2041       }
2042     }
2043     MemCheckCleaner.cleanup();
2044     SCEVCleaner.cleanup();
2045 
2046     if (SCEVCheckCond)
2047       SCEVCheckBlock->eraseFromParent();
2048     if (MemRuntimeCheckCond)
2049       MemCheckBlock->eraseFromParent();
2050   }
2051 
2052   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2053   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2054   /// depending on the generated condition.
2055   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2056                              BasicBlock *LoopVectorPreHeader,
2057                              BasicBlock *LoopExitBlock) {
2058     if (!SCEVCheckCond)
2059       return nullptr;
2060 
2061     Value *Cond = SCEVCheckCond;
2062     // Mark the check as used, to prevent it from being removed during cleanup.
2063     SCEVCheckCond = nullptr;
2064     if (auto *C = dyn_cast<ConstantInt>(Cond))
2065       if (C->isZero())
2066         return nullptr;
2067 
2068     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2069 
2070     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2071     // Create new preheader for vector loop.
2072     if (OuterLoop)
2073       OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2074 
2075     SCEVCheckBlock->getTerminator()->eraseFromParent();
2076     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2077     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2078                                                 SCEVCheckBlock);
2079 
2080     DT->addNewBlock(SCEVCheckBlock, Pred);
2081     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2082 
2083     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2084     if (AddBranchWeights)
2085       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2086     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2087     return SCEVCheckBlock;
2088   }
2089 
2090   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2091   /// the branches to branch to the vector preheader or \p Bypass, depending on
2092   /// the generated condition.
2093   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2094                                    BasicBlock *LoopVectorPreHeader) {
2095     // Check if we generated code that checks in runtime if arrays overlap.
2096     if (!MemRuntimeCheckCond)
2097       return nullptr;
2098 
2099     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2100     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2101                                                 MemCheckBlock);
2102 
2103     DT->addNewBlock(MemCheckBlock, Pred);
2104     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2105     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2106 
2107     if (OuterLoop)
2108       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2109 
2110     BranchInst &BI =
2111         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2112     if (AddBranchWeights) {
2113       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2114     }
2115     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2116     MemCheckBlock->getTerminator()->setDebugLoc(
2117         Pred->getTerminator()->getDebugLoc());
2118 
2119     // Mark the check as used, to prevent it from being removed during cleanup.
2120     MemRuntimeCheckCond = nullptr;
2121     return MemCheckBlock;
2122   }
2123 };
2124 } // namespace
2125 
2126 static bool useActiveLaneMask(TailFoldingStyle Style) {
2127   return Style == TailFoldingStyle::Data ||
2128          Style == TailFoldingStyle::DataAndControlFlow ||
2129          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2130 }
2131 
2132 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2133   return Style == TailFoldingStyle::DataAndControlFlow ||
2134          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2135 }
2136 
2137 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2138 // vectorization. The loop needs to be annotated with #pragma omp simd
2139 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2140 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2141 // explicit. Interleave hints are not allowed either. These limitations will be
2142 // relaxed in the future.
2143 // Please, note that we are currently forced to abuse the pragma 'clang
2144 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2145 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2146 // provides *explicit vectorization hints* (LV can bypass legal checks and
2147 // assume that vectorization is legal). However, both hints are implemented
2148 // using the same metadata (llvm.loop.vectorize, processed by
2149 // LoopVectorizeHints). This will be fixed in the future when the native IR
2150 // representation for pragma 'omp simd' is introduced.
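// As an illustration (a source-level sketch, not a test case from the tree),
// an outer loop annotated like
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// carries the required vector-width hint and is treated as a candidate for
// explicit outer-loop vectorization by this check.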
2151 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2152                                    OptimizationRemarkEmitter *ORE) {
2153   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2154   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2155 
2156   // Only outer loops with an explicit vectorization hint are supported.
2157   // Unannotated outer loops are ignored.
2158   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2159     return false;
2160 
2161   Function *Fn = OuterLp->getHeader()->getParent();
2162   if (!Hints.allowVectorization(Fn, OuterLp,
2163                                 true /*VectorizeOnlyWhenForced*/)) {
2164     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2165     return false;
2166   }
2167 
2168   if (Hints.getInterleave() > 1) {
2169     // TODO: Interleave support is future work.
2170     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2171                          "outer loops.\n");
2172     Hints.emitRemarkWithHints();
2173     return false;
2174   }
2175 
2176   return true;
2177 }
2178 
2179 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2180                                   OptimizationRemarkEmitter *ORE,
2181                                   SmallVectorImpl<Loop *> &V) {
2182   // Collect inner loops and outer loops without irreducible control flow. For
2183   // now, only collect outer loops that have explicit vectorization hints. If we
2184   // are stress testing the VPlan H-CFG construction, we collect the outermost
2185   // loop of every loop nest.
2186   if (L.isInnermost() || VPlanBuildStressTest ||
2187       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2188     LoopBlocksRPO RPOT(&L);
2189     RPOT.perform(LI);
2190     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2191       V.push_back(&L);
2192       // TODO: Collect inner loops inside marked outer loops in case
2193       // vectorization fails for the outer loop. Do not invoke
2194       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2195       // already known to be reducible. We can use an inherited attribute for
2196       // that.
2197       return;
2198     }
2199   }
2200   for (Loop *InnerL : L)
2201     collectSupportedLoops(*InnerL, LI, ORE, V);
2202 }
2203 
2204 //===----------------------------------------------------------------------===//
2205 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2206 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2207 //===----------------------------------------------------------------------===//
2208 
2209 /// Compute the transformed value of Index at offset StartValue using step
2210 /// StepValue.
2211 /// For integer induction, returns StartValue + Index * StepValue.
2212 /// For pointer induction, returns StartValue[Index * StepValue].
2213 /// FIXME: The newly created binary instructions should contain nsw/nuw
2214 /// flags, which can be found from the original scalar operations.
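/// For example (a worked illustration, not an additional contract): for an
/// integer induction with StartValue 100, Step 3 and Index 4 the result is
/// 100 + 4 * 3 = 112; for a pointer induction with the same Step and Index it
/// is &StartValue[4 * 3] in the array notation used above.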
2215 static Value *
2216 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2217                      Value *Step,
2218                      InductionDescriptor::InductionKind InductionKind,
2219                      const BinaryOperator *InductionBinOp) {
2220   Type *StepTy = Step->getType();
2221   Value *CastedIndex = StepTy->isIntegerTy()
2222                            ? B.CreateSExtOrTrunc(Index, StepTy)
2223                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2224   if (CastedIndex != Index) {
2225     CastedIndex->setName(CastedIndex->getName() + ".cast");
2226     Index = CastedIndex;
2227   }
2228 
2229   // Note: the IR at this point is broken. We cannot use SE to create any new
2230   // SCEV and then expand it, hoping that SCEV's simplification will give us
2231   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2232   // lead to various SCEV crashes. So all we can do is use the builder and rely
2233   // on InstCombine for future simplifications. Here we handle some trivial
2234   // cases only.
2235   auto CreateAdd = [&B](Value *X, Value *Y) {
2236     assert(X->getType() == Y->getType() && "Types don't match!");
2237     if (auto *CX = dyn_cast<ConstantInt>(X))
2238       if (CX->isZero())
2239         return Y;
2240     if (auto *CY = dyn_cast<ConstantInt>(Y))
2241       if (CY->isZero())
2242         return X;
2243     return B.CreateAdd(X, Y);
2244   };
2245 
2246   // We allow X to be a vector type, in which case Y will potentially be
2247   // splatted into a vector with the same element count.
2248   auto CreateMul = [&B](Value *X, Value *Y) {
2249     assert(X->getType()->getScalarType() == Y->getType() &&
2250            "Types don't match!");
2251     if (auto *CX = dyn_cast<ConstantInt>(X))
2252       if (CX->isOne())
2253         return Y;
2254     if (auto *CY = dyn_cast<ConstantInt>(Y))
2255       if (CY->isOne())
2256         return X;
2257     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2258     if (XVTy && !isa<VectorType>(Y->getType()))
2259       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2260     return B.CreateMul(X, Y);
2261   };
2262 
2263   switch (InductionKind) {
2264   case InductionDescriptor::IK_IntInduction: {
2265     assert(!isa<VectorType>(Index->getType()) &&
2266            "Vector indices not supported for integer inductions yet");
2267     assert(Index->getType() == StartValue->getType() &&
2268            "Index type does not match StartValue type");
2269     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2270       return B.CreateSub(StartValue, Index);
2271     auto *Offset = CreateMul(Index, Step);
2272     return CreateAdd(StartValue, Offset);
2273   }
2274   case InductionDescriptor::IK_PtrInduction:
2275     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2276   case InductionDescriptor::IK_FpInduction: {
2277     assert(!isa<VectorType>(Index->getType()) &&
2278            "Vector indices not supported for FP inductions yet");
2279     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2280     assert(InductionBinOp &&
2281            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2282             InductionBinOp->getOpcode() == Instruction::FSub) &&
2283            "Original bin op should be defined for FP induction");
2284 
2285     Value *MulExp = B.CreateFMul(Step, Index);
2286     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2287                          "induction");
2288   }
2289   case InductionDescriptor::IK_NoInduction:
2290     return nullptr;
2291   }
2292   llvm_unreachable("invalid enum");
2293 }
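// Worked example for emitTransformedIndex (illustrative values only, not
// taken from any particular loop): for an integer induction with
// StartValue = 100 and Step = 3, an Index of 8 is transformed to
// 100 + 8 * 3 = 124; with Step = -1 the whole expression folds to a single
// "sub StartValue, Index". For a pointer induction, StartValue is instead
// offset by Index * Step via a ptradd.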
2294 
2295 std::optional<unsigned> getMaxVScale(const Function &F,
2296                                      const TargetTransformInfo &TTI) {
2297   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2298     return MaxVScale;
2299 
2300   if (F.hasFnAttribute(Attribute::VScaleRange))
2301     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2302 
2303   return std::nullopt;
2304 }
2305 
2306 /// For the given VF and UF and maximum trip count computed for the loop, return
2307 /// whether the induction variable might overflow in the vectorized loop. If not,
2308 /// then we know a runtime overflow check always evaluates to false and can be
2309 /// removed.
2310 static bool isIndvarOverflowCheckKnownFalse(
2311     const LoopVectorizationCostModel *Cost,
2312     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2313   // Always be conservative if we don't know the exact unroll factor.
2314   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2315 
2316   Type *IdxTy = Cost->Legal->getWidestInductionType();
2317   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2318 
2319   // The runtime overflow check is known false iff the (max) trip-count
2320   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2321   // the vector loop induction variable.
2322   if (unsigned TC =
2323           Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2324     uint64_t MaxVF = VF.getKnownMinValue();
2325     if (VF.isScalable()) {
2326       std::optional<unsigned> MaxVScale =
2327           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2328       if (!MaxVScale)
2329         return false;
2330       MaxVF *= *MaxVScale;
2331     }
2332 
2333     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2334   }
2335 
2336   return false;
2337 }
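// Worked example for isIndvarOverflowCheckKnownFalse (hypothetical numbers):
// with an i32 widest induction type, MaxUIntTripCount is 2^32 - 1. If the
// maximum trip count is known to be 1000 and VF * UF is at most 16, then
// (2^32 - 1) - 1000 is far greater than 16, so the induction variable cannot
// wrap and the runtime overflow check is known to be false. If the maximum
// trip count is unknown, we conservatively keep the check.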
2338 
2339 // Return whether we allow using masked interleave-groups (for dealing with
2340 // strided loads/stores that reside in predicated blocks, or for dealing
2341 // with gaps).
2342 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2343   // If an override option has been passed in for interleaved accesses, use it.
2344   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2345     return EnableMaskedInterleavedMemAccesses;
2346 
2347   return TTI.enableMaskedInterleavedAccessVectorization();
2348 }
2349 
2350 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2351                                                VPReplicateRecipe *RepRecipe,
2352                                                const VPIteration &Instance,
2353                                                VPTransformState &State) {
2354   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2355 
2356   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2357   // the first lane and part.
2358   if (isa<NoAliasScopeDeclInst>(Instr))
2359     if (!Instance.isFirstIteration())
2360       return;
2361 
2362   // Does this instruction return a value?
2363   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2364 
2365   Instruction *Cloned = Instr->clone();
2366   if (!IsVoidRetTy) {
2367     Cloned->setName(Instr->getName() + ".cloned");
2368 #if !defined(NDEBUG)
2369     // Verify that VPlan type inference results agree with the type of the
2370     // generated values.
2371     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2372            "inferred type and type from generated instructions do not match");
2373 #endif
2374   }
2375 
2376   RepRecipe->setFlags(Cloned);
2377 
2378   if (auto DL = Instr->getDebugLoc())
2379     State.setDebugLocFrom(DL);
2380 
2381   // Replace the operands of the cloned instructions with their scalar
2382   // equivalents in the new loop.
2383   for (const auto &I : enumerate(RepRecipe->operands())) {
2384     auto InputInstance = Instance;
2385     VPValue *Operand = I.value();
2386     if (vputils::isUniformAfterVectorization(Operand))
2387       InputInstance.Lane = VPLane::getFirstLane();
2388     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2389   }
2390   State.addNewMetadata(Cloned, Instr);
2391 
2392   // Place the cloned scalar in the new loop.
2393   State.Builder.Insert(Cloned);
2394 
2395   State.set(RepRecipe, Cloned, Instance);
2396 
2397   // If we just cloned a new assumption, add it to the assumption cache.
2398   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2399     AC->registerAssumption(II);
2400 
2401   // End if-block.
2402   bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2403   if (IfPredicateInstr)
2404     PredicatedInstructions.push_back(Cloned);
2405 }
2406 
2407 Value *
2408 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2409   if (VectorTripCount)
2410     return VectorTripCount;
2411 
2412   Value *TC = getTripCount();
2413   IRBuilder<> Builder(InsertBlock->getTerminator());
2414 
2415   Type *Ty = TC->getType();
2416   // This is where we can make the step a runtime constant.
2417   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2418 
2419   // If the tail is to be folded by masking, round the number of iterations N
2420   // up to a multiple of Step instead of rounding down. This is done by first
2421   // adding Step-1 and then rounding down. Note that it's ok if this addition
2422   // overflows: the vector induction variable will eventually wrap to zero given
2423   // that it starts at zero and its Step is a power of two; the loop will then
2424   // exit, with the last early-exit vector comparison also producing all-true.
2425   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2426   // is accounted for in emitIterationCountCheck that adds an overflow check.
2427   if (Cost->foldTailByMasking()) {
2428     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2429            "VF*UF must be a power of 2 when folding tail by masking");
2430     TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2431                            "n.rnd.up");
2432   }
2433 
2434   // Now we need to generate the expression for the part of the loop that the
2435   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2436   // iterations are not required for correctness, or N - Step, otherwise. Step
2437   // is equal to the vectorization factor (number of SIMD elements) times the
2438   // unroll factor (number of SIMD instructions).
2439   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2440 
2441   // There are cases where we *must* run at least one iteration in the remainder
2442   // loop.  See the cost model for when this can happen.  If the step evenly
2443   // divides the trip count, we set the remainder to be equal to the step. If
2444   // the step does not evenly divide the trip count, no adjustment is necessary
2445   // since there will already be scalar iterations. Note that the minimum
2446   // iterations check ensures that N >= Step.
2447   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2448     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2449     R = Builder.CreateSelect(IsZero, Step, R);
2450   }
2451 
2452   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2453 
2454   return VectorTripCount;
2455 }
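// Worked example for getOrCreateVectorTripCount (hypothetical numbers): with
// VF = 4 and UF = 2 the step is 8. For a trip count of 1003, n.mod.vf is
// 1003 % 8 = 3 and n.vec is 1000, so the vector loop covers 1000 iterations
// and the scalar remainder handles the last 3. If a scalar epilogue is
// required and the step divides the trip count exactly (say 1000), the
// remainder is bumped from 0 to 8 so the epilogue still runs.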
2456 
2457 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2458   Value *Count = getTripCount();
2459   // Reuse the existing vector loop preheader for the TC checks.
2460   // Note that a new preheader block is generated for the vector loop.
2461   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2462   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2463 
2464   // Generate code to check if the loop's trip count is less than VF * UF, or
2465   // equal to it in case a scalar epilogue is required; this implies that the
2466   // vector trip count is zero. This check also covers the case where adding one
2467   // to the backedge-taken count overflowed leading to an incorrect trip count
2468   // of zero. In this case we will also jump to the scalar loop.
2469   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2470                                                        : ICmpInst::ICMP_ULT;
2471 
2472   // If tail is to be folded, vector loop takes care of all iterations.
2473   Type *CountTy = Count->getType();
2474   Value *CheckMinIters = Builder.getFalse();
2475   auto CreateStep = [&]() -> Value * {
2476     // Create step with max(MinProfitableTripCount, UF * VF).
2477     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2478       return createStepForVF(Builder, CountTy, VF, UF);
2479 
2480     Value *MinProfTC =
2481         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2482     if (!VF.isScalable())
2483       return MinProfTC;
2484     return Builder.CreateBinaryIntrinsic(
2485         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2486   };
2487 
2488   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2489   if (Style == TailFoldingStyle::None)
2490     CheckMinIters =
2491         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2492   else if (VF.isScalable() &&
2493            !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2494            Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2495     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2496     // an overflow to zero when updating induction variables and so an
2497     // additional overflow check is required before entering the vector loop.
2498 
2499     // Get the maximum unsigned value for the type.
2500     Value *MaxUIntTripCount =
2501         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2502     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2503 
2504     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2505     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2506   }
2507 
2508   // Create new preheader for vector loop.
2509   LoopVectorPreHeader =
2510       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2511                  "vector.ph");
2512 
2513   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2514                                DT->getNode(Bypass)->getIDom()) &&
2515          "TC check is expected to dominate Bypass");
2516 
2517   // Update dominator for Bypass & LoopExit (if needed).
2518   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2519   BranchInst &BI =
2520       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2521   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2522     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2523   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2524   LoopBypassBlocks.push_back(TCCheckBlock);
2525 }
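// Worked example for the minimum-iteration check (hypothetical numbers): with
// a fixed VF = 4, UF = 2 and a minimum profitable trip count of 16,
// CreateStep() returns 16, so any loop with fewer than 16 iterations (or 16
// or fewer when a scalar epilogue is required) branches straight to the
// scalar loop. When UF * VF already meets the profitability bound, the plain
// VF * UF step is used instead.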
2526 
2527 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2528   BasicBlock *const SCEVCheckBlock =
2529       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2530   if (!SCEVCheckBlock)
2531     return nullptr;
2532 
2533   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2534            (OptForSizeBasedOnProfile &&
2535             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2536          "Cannot SCEV check stride or overflow when optimizing for size");
2537 
2538 
2539   // Update dominator only if this is first RT check.
2540   if (LoopBypassBlocks.empty()) {
2541     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2542     if (!Cost->requiresScalarEpilogue(VF.isVector()))
2543       // If there is an epilogue which must run, there's no edge from the
2544       // middle block to exit blocks and thus no need to update the immediate
2545       // dominator of the exit blocks.
2546       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2547   }
2548 
2549   LoopBypassBlocks.push_back(SCEVCheckBlock);
2550   AddedSafetyChecks = true;
2551   return SCEVCheckBlock;
2552 }
2553 
2554 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2555   // VPlan-native path does not do any analysis for runtime checks currently.
2556   if (EnableVPlanNativePath)
2557     return nullptr;
2558 
2559   BasicBlock *const MemCheckBlock =
2560       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2561 
2562   // Check if we generated code that checks at runtime whether arrays overlap.
2563   // We put the checks into a separate block to make the more common case of
2564   // few elements faster.
2565   if (!MemCheckBlock)
2566     return nullptr;
2567 
2568   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2569     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2570            "Cannot emit memory checks when optimizing for size, unless forced "
2571            "to vectorize.");
2572     ORE->emit([&]() {
2573       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2574                                         OrigLoop->getStartLoc(),
2575                                         OrigLoop->getHeader())
2576              << "Code-size may be reduced by not forcing "
2577                 "vectorization, or by source-code modifications "
2578                 "eliminating the need for runtime checks "
2579                 "(e.g., adding 'restrict').";
2580     });
2581   }
2582 
2583   LoopBypassBlocks.push_back(MemCheckBlock);
2584 
2585   AddedSafetyChecks = true;
2586 
2587   return MemCheckBlock;
2588 }
2589 
2590 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2591   LoopScalarBody = OrigLoop->getHeader();
2592   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2593   assert(LoopVectorPreHeader && "Invalid loop structure");
2594   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2595   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2596          "multiple exit loop without required epilogue?");
2597 
2598   LoopMiddleBlock =
2599       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2600                  LI, nullptr, Twine(Prefix) + "middle.block");
2601   LoopScalarPreHeader =
2602       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2603                  nullptr, Twine(Prefix) + "scalar.ph");
2604 }
2605 
2606 PHINode *InnerLoopVectorizer::createInductionResumeValue(
2607     PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2608     ArrayRef<BasicBlock *> BypassBlocks,
2609     std::pair<BasicBlock *, Value *> AdditionalBypass) {
2610   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
2611   assert(VectorTripCount && "Expected valid arguments");
2612 
2613   Instruction *OldInduction = Legal->getPrimaryInduction();
2614   Value *&EndValue = IVEndValues[OrigPhi];
2615   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2616   if (OrigPhi == OldInduction) {
2617     // We know what the end value is.
2618     EndValue = VectorTripCount;
2619   } else {
2620     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
2621 
2622     // Fast-math-flags propagate from the original induction instruction.
2623     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2624       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2625 
2626     EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
2627                                     Step, II.getKind(), II.getInductionBinOp());
2628     EndValue->setName("ind.end");
2629 
2630     // Compute the end value for the additional bypass (if applicable).
2631     if (AdditionalBypass.first) {
2632       B.SetInsertPoint(AdditionalBypass.first,
2633                        AdditionalBypass.first->getFirstInsertionPt());
2634       EndValueFromAdditionalBypass =
2635           emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
2636                                Step, II.getKind(), II.getInductionBinOp());
2637       EndValueFromAdditionalBypass->setName("ind.end");
2638     }
2639   }
2640 
2641   // Create phi nodes to merge from the backedge-taken check block.
2642   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
2643                                          LoopScalarPreHeader->getFirstNonPHI());
2644   // Copy original phi DL over to the new one.
2645   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2646 
2647   // The new PHI merges the original incoming value, in case of a bypass,
2648   // or the value at the end of the vectorized loop.
2649   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
2650 
2651   // Fix the scalar body counter (PHI node).
2652   // The old induction's phi node in the scalar body needs the truncated
2653   // value.
2654   for (BasicBlock *BB : BypassBlocks)
2655     BCResumeVal->addIncoming(II.getStartValue(), BB);
2656 
2657   if (AdditionalBypass.first)
2658     BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
2659                                           EndValueFromAdditionalBypass);
2660   return BCResumeVal;
2661 }
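// Worked example for createInductionResumeValue (illustrative): for the
// primary induction "i = phi [0, preheader], [i + 1, latch]" with a vector
// trip count of %n.vec, bc.resume.val is %n.vec when the scalar preheader is
// reached from the middle block and 0 (the original start value) when it is
// reached from a bypass block, so the scalar remainder resumes exactly where
// the vector loop stopped.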
2662 
2663 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2664 /// expansion results.
2665 static Value *getExpandedStep(const InductionDescriptor &ID,
2666                               const SCEV2ValueTy &ExpandedSCEVs) {
2667   const SCEV *Step = ID.getStep();
2668   if (auto *C = dyn_cast<SCEVConstant>(Step))
2669     return C->getValue();
2670   if (auto *U = dyn_cast<SCEVUnknown>(Step))
2671     return U->getValue();
2672   auto I = ExpandedSCEVs.find(Step);
2673   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2674   return I->second;
2675 }
2676 
2677 void InnerLoopVectorizer::createInductionResumeValues(
2678     const SCEV2ValueTy &ExpandedSCEVs,
2679     std::pair<BasicBlock *, Value *> AdditionalBypass) {
2680   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
2681           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
2682          "Inconsistent information about additional bypass.");
2683   // We are going to resume the execution of the scalar loop.
2684   // Go over all of the induction variables that we found and fix the
2685   // PHIs that are left in the scalar version of the loop.
2686   // The starting values of PHI nodes depend on the counter of the last
2687   // iteration in the vectorized loop.
2688   // If we come from a bypass edge then we need to start from the original
2689   // start value.
2690   for (const auto &InductionEntry : Legal->getInductionVars()) {
2691     PHINode *OrigPhi = InductionEntry.first;
2692     const InductionDescriptor &II = InductionEntry.second;
2693     PHINode *BCResumeVal = createInductionResumeValue(
2694         OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
2695         AdditionalBypass);
2696     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
2697   }
2698 }
2699 
2700 std::pair<BasicBlock *, Value *>
2701 InnerLoopVectorizer::createVectorizedLoopSkeleton(
2702     const SCEV2ValueTy &ExpandedSCEVs) {
2703   /*
2704    In this function we generate a new loop. The new loop will contain
2705    the vectorized instructions while the old loop will continue to run the
2706    scalar remainder.
2707 
2708        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2709      /  |      preheader are expanded here. Eventually all required SCEV
2710     /   |      expansion should happen here.
2711    /    v
2712   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2713   |  /  |
2714   | /   v
2715   ||   [ ]     <-- vector pre header.
2716   |/    |
2717   |     v
2718   |    [  ] \
2719   |    [  ]_|   <-- vector loop (created during VPlan execution).
2720   |     |
2721   |     v
2722   \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2723    |    |                       successors created during VPlan execution)
2724    \/   |
2725    /\   v
2726    | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
2727    |    |
2728  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
2729    |   [ ] \
2730    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
2731     \   |
2732      \  v
2733       >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
2734    ...
2735    */
2736 
2737   // Create an empty vector loop, and prepare basic blocks for the runtime
2738   // checks.
2739   createVectorLoopSkeleton("");
2740 
2741   // Now, compare the new count to zero. If it is zero skip the vector loop and
2742   // jump to the scalar loop. This check also covers the case where the
2743   // backedge-taken count is uint##_max: adding one to it will overflow leading
2744   // to an incorrect trip count of zero. In this (rare) case we will also jump
2745   // to the scalar loop.
2746   emitIterationCountCheck(LoopScalarPreHeader);
2747 
2748   // Generate the code to check any assumptions that we've made for SCEV
2749   // expressions.
2750   emitSCEVChecks(LoopScalarPreHeader);
2751 
2752   // Generate the code that checks at runtime whether arrays overlap. We put the
2753   // checks into a separate block to make the more common case of few elements
2754   // faster.
2755   emitMemRuntimeChecks(LoopScalarPreHeader);
2756 
2757   // Emit phis for the new starting index of the scalar loop.
2758   createInductionResumeValues(ExpandedSCEVs);
2759 
2760   return {LoopVectorPreHeader, nullptr};
2761 }
2762 
2763 // Fix up external users of the induction variable. At this point, we are
2764 // in LCSSA form, with all external PHIs that use the IV having one input value,
2765 // coming from the remainder loop. We need those PHIs to also have a correct
2766 // value for the IV when arriving directly from the middle block.
2767 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2768                                        const InductionDescriptor &II,
2769                                        Value *VectorTripCount, Value *EndValue,
2770                                        BasicBlock *MiddleBlock,
2771                                        BasicBlock *VectorHeader, VPlan &Plan,
2772                                        VPTransformState &State) {
2773   // There are two kinds of external IV usages - those that use the value
2774   // computed in the last iteration (the PHI) and those that use the penultimate
2775   // value (the value that feeds into the phi from the loop latch).
2776   // We allow both, but they, obviously, have different values.
2777 
2778   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
2779 
2780   DenseMap<Value *, Value *> MissingVals;
2781 
2782   // An external user of the last iteration's value should see the value that
2783   // the remainder loop uses to initialize its own IV.
2784   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2785   for (User *U : PostInc->users()) {
2786     Instruction *UI = cast<Instruction>(U);
2787     if (!OrigLoop->contains(UI)) {
2788       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2789       MissingVals[UI] = EndValue;
2790     }
2791   }
2792 
2793   // An external user of the penultimate value needs to see EndValue - Step.
2794   // The simplest way to get this is to recompute it from the constituent SCEVs,
2795   // that is Start + (Step * (CRD - 1)).
2796   for (User *U : OrigPhi->users()) {
2797     auto *UI = cast<Instruction>(U);
2798     if (!OrigLoop->contains(UI)) {
2799       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2800       IRBuilder<> B(MiddleBlock->getTerminator());
2801 
2802       // Fast-math-flags propagate from the original induction instruction.
2803       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2804         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2805 
2806       Value *CountMinusOne = B.CreateSub(
2807           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
2808       CountMinusOne->setName("cmo");
2809 
2810       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2811       assert(StepVPV && "step must have been expanded during VPlan execution");
2812       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2813                                         : State.get(StepVPV, {0, 0});
2814       Value *Escape =
2815           emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
2816                                II.getKind(), II.getInductionBinOp());
2817       Escape->setName("ind.escape");
2818       MissingVals[UI] = Escape;
2819     }
2820   }
2821 
2822   for (auto &I : MissingVals) {
2823     PHINode *PHI = cast<PHINode>(I.first);
2824     // One corner case we have to handle is two IVs "chasing" each other,
2825     // that is %IV2 = phi [...], [ %IV1, %latch ]
2826     // In this case, if IV1 has an external use, we need to avoid adding both
2827     // "last value of IV1" and "penultimate value of IV2". So, verify that we
2828     // don't already have an incoming value for the middle block.
2829     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
2830       PHI->addIncoming(I.second, MiddleBlock);
2831       Plan.removeLiveOut(PHI);
2832     }
2833   }
2834 }
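// Worked example for fixupIVUsers (illustrative): for a canonical induction
// starting at 0 with step 1 and a vector trip count of %n.vec, an LCSSA phi
// that uses the post-increment value receives EndValue (%n.vec) directly,
// while a phi that uses the induction phi itself receives
// ind.escape = 0 + 1 * (%n.vec - 1), the value the induction had in the last
// iteration covered by the vector loop.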
2835 
2836 namespace {
2837 
2838 struct CSEDenseMapInfo {
2839   static bool canHandle(const Instruction *I) {
2840     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2841            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2842   }
2843 
2844   static inline Instruction *getEmptyKey() {
2845     return DenseMapInfo<Instruction *>::getEmptyKey();
2846   }
2847 
2848   static inline Instruction *getTombstoneKey() {
2849     return DenseMapInfo<Instruction *>::getTombstoneKey();
2850   }
2851 
2852   static unsigned getHashValue(const Instruction *I) {
2853     assert(canHandle(I) && "Unknown instruction!");
2854     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2855                                                            I->value_op_end()));
2856   }
2857 
2858   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2859     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2860         LHS == getTombstoneKey() || RHS == getTombstoneKey())
2861       return LHS == RHS;
2862     return LHS->isIdenticalTo(RHS);
2863   }
2864 };
2865 
2866 } // end anonymous namespace
2867 
2868 /// Perform CSE of induction variable instructions.
2869 static void cse(BasicBlock *BB) {
2870   // Perform simple cse.
2871   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2872   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2873     if (!CSEDenseMapInfo::canHandle(&In))
2874       continue;
2875 
2876     // Check if we can replace this instruction with any of the
2877     // visited instructions.
2878     if (Instruction *V = CSEMap.lookup(&In)) {
2879       In.replaceAllUsesWith(V);
2880       In.eraseFromParent();
2881       continue;
2882     }
2883 
2884     CSEMap[&In] = &In;
2885   }
2886 }
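// For example (illustrative), two structurally identical getelementptr
// instructions created while widening the same address computation across
// unroll parts hash to the same key above, so the second occurrence is
// replaced by the first and erased.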
2887 
2888 InstructionCost
2889 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2890                                               ElementCount VF) const {
2891   // We only need to calculate a cost if the VF is scalar; for actual vectors
2892   // we should already have a pre-calculated cost at each VF.
2893   if (!VF.isScalar())
2894     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2895 
2896   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2897   Type *RetTy = CI->getType();
2898   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2899     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2900       return *RedCost;
2901 
2902   SmallVector<Type *, 4> Tys;
2903   for (auto &ArgOp : CI->args())
2904     Tys.push_back(ArgOp->getType());
2905 
2906   InstructionCost ScalarCallCost =
2907       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2908 
2909   // If this is an intrinsic we may have a lower cost for it.
2910   if (getVectorIntrinsicIDForCall(CI, TLI)) {
2911     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2912     return std::min(ScalarCallCost, IntrinsicCost);
2913   }
2914   return ScalarCallCost;
2915 }
2916 
2917 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
2918   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2919     return Elt;
2920   return VectorType::get(Elt, VF);
2921 }
2922 
2923 InstructionCost
2924 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2925                                                    ElementCount VF) const {
2926   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2927   assert(ID && "Expected intrinsic call!");
2928   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
2929   FastMathFlags FMF;
2930   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2931     FMF = FPMO->getFastMathFlags();
2932 
2933   SmallVector<const Value *> Arguments(CI->args());
2934   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2935   SmallVector<Type *> ParamTys;
2936   std::transform(FTy->param_begin(), FTy->param_end(),
2937                  std::back_inserter(ParamTys),
2938                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
2939 
2940   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2941                                     dyn_cast<IntrinsicInst>(CI));
2942   return TTI.getIntrinsicInstrCost(CostAttrs,
2943                                    TargetTransformInfo::TCK_RecipThroughput);
2944 }
2945 
2946 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
2947                                             VPlan &Plan) {
2948   // Fix widened non-induction PHIs by setting up the PHI operands.
2949   if (EnableVPlanNativePath)
2950     fixNonInductionPHIs(Plan, State);
2951 
2952   // Forget the original basic block.
2953   PSE.getSE()->forgetLoop(OrigLoop);
2954   PSE.getSE()->forgetBlockAndLoopDispositions();
2955 
2956   // After vectorization, the exit blocks of the original loop will have
2957   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2958   // looked through single-entry phis.
2959   SmallVector<BasicBlock *> ExitBlocks;
2960   OrigLoop->getExitBlocks(ExitBlocks);
2961   for (BasicBlock *Exit : ExitBlocks)
2962     for (PHINode &PN : Exit->phis())
2963       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
2964 
2965   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2966   VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
2967   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
2968   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2969     // No edge from the middle block to the unique exit block has been inserted
2970     // and there is nothing to fix from vector loop; phis should have incoming
2971     // from scalar loop only.
2972   } else {
2973     // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
2974     // the cost model.
2975 
2976     // If we inserted an edge from the middle block to the unique exit block,
2977     // update uses outside the loop (phis) to account for the newly inserted
2978     // edge.
2979 
2980     // Fix-up external users of the induction variables.
2981     for (const auto &Entry : Legal->getInductionVars())
2982       fixupIVUsers(Entry.first, Entry.second,
2983                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
2984                    IVEndValues[Entry.first], LoopMiddleBlock,
2985                    VectorLoop->getHeader(), Plan, State);
2986   }
2987 
2988   // Fix live-out phis not already fixed earlier.
2989   for (const auto &KV : Plan.getLiveOuts())
2990     KV.second->fixPhi(Plan, State);
2991 
2992   for (Instruction *PI : PredicatedInstructions)
2993     sinkScalarOperands(&*PI);
2994 
2995   // Remove redundant induction instructions.
2996   cse(VectorLoop->getHeader());
2997 
2998   // Set/update profile weights for the vector and remainder loops as original
2999   // loop iterations are now distributed among them. Note that original loop
3000   // represented by LoopScalarBody becomes remainder loop after vectorization.
3001   //
3002   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3003   // end up with a slightly roughened result, but that should be OK since the
3004   // profile is not inherently precise anyway. Note also that a possible bypass
3005   // of the vector code caused by legality checks is ignored, optimistically
3006   // assigning all the weight to the vector loop.
3007   //
3008   // For scalable vectorization we can't know at compile time how many iterations
3009   // of the loop are handled in one vector iteration, so instead assume a pessimistic
3010   // vscale of '1'.
3011   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3012                                LI->getLoopFor(LoopScalarBody),
3013                                VF.getKnownMinValue() * UF);
3014 }
3015 
3016 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3017   // The basic block and loop containing the predicated instruction.
3018   auto *PredBB = PredInst->getParent();
3019   auto *VectorLoop = LI->getLoopFor(PredBB);
3020 
3021   // Initialize a worklist with the operands of the predicated instruction.
3022   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3023 
3024   // Holds instructions that we need to analyze again. An instruction may be
3025   // reanalyzed if we don't yet know if we can sink it or not.
3026   SmallVector<Instruction *, 8> InstsToReanalyze;
3027 
3028   // Returns true if a given use occurs in the predicated block. Phi nodes use
3029   // their operands in their corresponding predecessor blocks.
3030   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3031     auto *I = cast<Instruction>(U.getUser());
3032     BasicBlock *BB = I->getParent();
3033     if (auto *Phi = dyn_cast<PHINode>(I))
3034       BB = Phi->getIncomingBlock(
3035           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3036     return BB == PredBB;
3037   };
3038 
3039   // Iteratively sink the scalarized operands of the predicated instruction
3040   // into the block we created for it. When an instruction is sunk, its
3041   // operands are then added to the worklist. The algorithm ends once a full
3042   // pass through the worklist fails to sink a single instruction.
3043   bool Changed;
3044   do {
3045     // Add the instructions that need to be reanalyzed to the worklist, and
3046     // reset the changed indicator.
3047     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3048     InstsToReanalyze.clear();
3049     Changed = false;
3050 
3051     while (!Worklist.empty()) {
3052       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3053 
3054       // We can't sink an instruction if it is a phi node, is not in the loop,
3055       // may have side effects or may read from memory.
3056       // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3057       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3058           I->mayHaveSideEffects() || I->mayReadFromMemory())
3059         continue;
3060 
3061       // If the instruction is already in PredBB, check if we can sink its
3062       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3063       // sinking the scalar instruction I, hence it appears in PredBB; but it
3064       // may have failed to sink I's operands (recursively), which we try
3065       // (again) here.
3066       if (I->getParent() == PredBB) {
3067         Worklist.insert(I->op_begin(), I->op_end());
3068         continue;
3069       }
3070 
3071       // It's legal to sink the instruction if all its uses occur in the
3072       // predicated block. Otherwise, there's nothing to do yet, and we may
3073       // need to reanalyze the instruction.
3074       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3075         InstsToReanalyze.push_back(I);
3076         continue;
3077       }
3078 
3079       // Move the instruction to the beginning of the predicated block, and add
3080       // its operands to the worklist.
3081       I->moveBefore(&*PredBB->getFirstInsertionPt());
3082       Worklist.insert(I->op_begin(), I->op_end());
3083 
3084       // The sinking may have enabled other instructions to be sunk, so we will
3085       // need to iterate.
3086       Changed = true;
3087     }
3088   } while (Changed);
3089 }
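// Worked example for sinkScalarOperands (illustrative): if a predicated store
// in PredBB uses an address computed by a getelementptr that still lives in
// the loop header, and all of that getelementptr's users are inside PredBB,
// the getelementptr is moved to the start of PredBB; its own operands are
// then revisited on the next pass in case they can be sunk as well.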
3090 
3091 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3092                                               VPTransformState &State) {
3093   auto Iter = vp_depth_first_deep(Plan.getEntry());
3094   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3095     for (VPRecipeBase &P : VPBB->phis()) {
3096       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3097       if (!VPPhi)
3098         continue;
3099       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3100       // Make sure the builder has a valid insert point.
3101       Builder.SetInsertPoint(NewPhi);
3102       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3103         VPValue *Inc = VPPhi->getIncomingValue(i);
3104         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3105         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3106       }
3107     }
3108   }
3109 }
3110 
3111 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3112   // We should not collect Scalars more than once per VF. Right now, this
3113   // function is called from collectUniformsAndScalars(), which already does
3114   // this check. Collecting Scalars for VF=1 does not make any sense.
3115   assert(VF.isVector() && !Scalars.contains(VF) &&
3116          "This function should not be visited twice for the same VF");
3117 
3118   // This avoids any chances of creating a REPLICATE recipe during planning
3119   // since that would result in generation of scalarized code during execution,
3120   // which is not supported for scalable vectors.
3121   if (VF.isScalable()) {
3122     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3123     return;
3124   }
3125 
3126   SmallSetVector<Instruction *, 8> Worklist;
3127 
3128   // These sets are used to seed the analysis with pointers used by memory
3129   // accesses that will remain scalar.
3130   SmallSetVector<Instruction *, 8> ScalarPtrs;
3131   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3132   auto *Latch = TheLoop->getLoopLatch();
3133 
3134   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3135   // The pointer operands of loads and stores will be scalar as long as the
3136   // memory access is not a gather or scatter operation. The value operand of a
3137   // store will remain scalar if the store is scalarized.
3138   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3139     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3140     assert(WideningDecision != CM_Unknown &&
3141            "Widening decision should be ready at this moment");
3142     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3143       if (Ptr == Store->getValueOperand())
3144         return WideningDecision == CM_Scalarize;
3145     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3146            "Ptr is neither a value or pointer operand");
3147     return WideningDecision != CM_GatherScatter;
3148   };
3149 
3150   // A helper that returns true if the given value is a bitcast or
3151   // getelementptr instruction contained in the loop.
3152   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3153     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3154             isa<GetElementPtrInst>(V)) &&
3155            !TheLoop->isLoopInvariant(V);
3156   };
3157 
3158   // A helper that evaluates a memory access's use of a pointer. If the use will
3159   // be a scalar use and the pointer is only used by memory accesses, we place
3160   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3161   // PossibleNonScalarPtrs.
3162   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3163     // We only care about bitcast and getelementptr instructions contained in
3164     // the loop.
3165     if (!isLoopVaryingBitCastOrGEP(Ptr))
3166       return;
3167 
3168     // If the pointer has already been identified as scalar (e.g., if it was
3169     // also identified as uniform), there's nothing to do.
3170     auto *I = cast<Instruction>(Ptr);
3171     if (Worklist.count(I))
3172       return;
3173 
3174     // If the use of the pointer will be a scalar use, and all users of the
3175     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3176     // place the pointer in PossibleNonScalarPtrs.
3177     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3178           return isa<LoadInst>(U) || isa<StoreInst>(U);
3179         }))
3180       ScalarPtrs.insert(I);
3181     else
3182       PossibleNonScalarPtrs.insert(I);
3183   };
3184 
3185   // We seed the scalars analysis with two classes of instructions: (1)
3186   // instructions marked uniform-after-vectorization and (2) bitcast,
3187   // getelementptr and (pointer) phi instructions used by memory accesses
3188   // requiring a scalar use.
3189   //
3190   // (1) Add to the worklist all instructions that have been identified as
3191   // uniform-after-vectorization.
3192   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3193 
3194   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3195   // memory accesses requiring a scalar use. The pointer operands of loads and
3196   // stores will be scalar as long as the memory access is not a gather or
3197   // scatter operation. The value operand of a store will remain scalar if the
3198   // store is scalarized.
3199   for (auto *BB : TheLoop->blocks())
3200     for (auto &I : *BB) {
3201       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3202         evaluatePtrUse(Load, Load->getPointerOperand());
3203       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3204         evaluatePtrUse(Store, Store->getPointerOperand());
3205         evaluatePtrUse(Store, Store->getValueOperand());
3206       }
3207     }
3208   for (auto *I : ScalarPtrs)
3209     if (!PossibleNonScalarPtrs.count(I)) {
3210       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3211       Worklist.insert(I);
3212     }
3213 
3214   // Insert the forced scalars.
3215   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3216   // induction variable when the PHI user is scalarized.
3217   auto ForcedScalar = ForcedScalars.find(VF);
3218   if (ForcedScalar != ForcedScalars.end())
3219     for (auto *I : ForcedScalar->second) {
3220       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3221       Worklist.insert(I);
3222     }
3223 
3224   // Expand the worklist by looking through any bitcasts and getelementptr
3225   // instructions we've already identified as scalar. This is similar to the
3226   // expansion step in collectLoopUniforms(); however, here we're only
3227   // expanding to include additional bitcasts and getelementptr instructions.
3228   unsigned Idx = 0;
3229   while (Idx != Worklist.size()) {
3230     Instruction *Dst = Worklist[Idx++];
3231     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3232       continue;
3233     auto *Src = cast<Instruction>(Dst->getOperand(0));
3234     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3235           auto *J = cast<Instruction>(U);
3236           return !TheLoop->contains(J) || Worklist.count(J) ||
3237                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3238                   isScalarUse(J, Src));
3239         })) {
3240       Worklist.insert(Src);
3241       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3242     }
3243   }
3244 
3245   // An induction variable will remain scalar if all users of the induction
3246   // variable and induction variable update remain scalar.
3247   for (const auto &Induction : Legal->getInductionVars()) {
3248     auto *Ind = Induction.first;
3249     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3250 
3251     // If tail-folding is applied, the primary induction variable will be used
3252     // to feed a vector compare.
3253     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3254       continue;
3255 
3256     // Returns true if \p Indvar is a pointer induction that is used directly by
3257     // load/store instruction \p I.
3258     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3259                                               Instruction *I) {
3260       return Induction.second.getKind() ==
3261                  InductionDescriptor::IK_PtrInduction &&
3262              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3263              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3264     };
3265 
3266     // Determine if all users of the induction variable are scalar after
3267     // vectorization.
3268     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3269       auto *I = cast<Instruction>(U);
3270       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3271              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3272     });
3273     if (!ScalarInd)
3274       continue;
3275 
3276     // If the induction variable update is a fixed-order recurrence, neither the
3277     // induction variable nor its update should be marked scalar after
3278     // vectorization.
3279     auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3280     if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3281       continue;
3282 
3283     // Determine if all users of the induction variable update instruction are
3284     // scalar after vectorization.
3285     auto ScalarIndUpdate =
3286         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3287           auto *I = cast<Instruction>(U);
3288           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3289                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3290         });
3291     if (!ScalarIndUpdate)
3292       continue;
3293 
3294     // The induction variable and its update instruction will remain scalar.
3295     Worklist.insert(Ind);
3296     Worklist.insert(IndUpdate);
3297     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3298     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3299                       << "\n");
3300   }
3301 
3302   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3303 }
3304 
3305 bool LoopVectorizationCostModel::isScalarWithPredication(
3306     Instruction *I, ElementCount VF) const {
3307   if (!isPredicatedInst(I))
3308     return false;
3309 
3310   // Do we have a non-scalar lowering for this predicated
3311   // instruction? No - it is scalar with predication.
3312   switch(I->getOpcode()) {
3313   default:
3314     return true;
3315   case Instruction::Call:
3316     if (VF.isScalar())
3317       return true;
3318     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3319                .Kind == CM_Scalarize;
3320   case Instruction::Load:
3321   case Instruction::Store: {
3322     auto *Ptr = getLoadStorePointerOperand(I);
3323     auto *Ty = getLoadStoreType(I);
3324     Type *VTy = Ty;
3325     if (VF.isVector())
3326       VTy = VectorType::get(Ty, VF);
3327     const Align Alignment = getLoadStoreAlignment(I);
3328     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3329                                 TTI.isLegalMaskedGather(VTy, Alignment))
3330                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3331                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3332   }
3333   case Instruction::UDiv:
3334   case Instruction::SDiv:
3335   case Instruction::SRem:
3336   case Instruction::URem: {
3337     // We have the option to use the safe-divisor idiom to avoid predication.
3338     // The cost based decision here will always select safe-divisor for
3339     // scalable vectors as scalarization isn't legal.
3340     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3341     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3342   }
3343   }
3344 }
3345 
3346 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3347   if (!blockNeedsPredicationForAnyReason(I->getParent()))
3348     return false;
3349 
3350   // Can we prove this instruction is safe to unconditionally execute?
3351   // If not, we must use some form of predication.
3352   switch(I->getOpcode()) {
3353   default:
3354     return false;
3355   case Instruction::Load:
3356   case Instruction::Store: {
3357     if (!Legal->isMaskRequired(I))
3358       return false;
3359     // When we know the load's address is loop invariant and the instruction
3360     // in the original scalar loop was unconditionally executed then we
3361     // don't need to mark it as a predicated instruction. Tail folding may
3362     // introduce additional predication, but we're guaranteed to always have
3363     // at least one active lane.  We call Legal->blockNeedsPredication here
3364     // because it doesn't query tail-folding.  For stores, we need to prove
3365     // both speculation safety (which follows from the same argument as loads)
3366     // and that the value being stored is correct.  The easiest form of the
3367     // latter is to require that all values stored are the same.
3368     if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3369         (isa<LoadInst>(I) ||
3370          (isa<StoreInst>(I) &&
3371           TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3372         !Legal->blockNeedsPredication(I->getParent()))
3373       return false;
3374     return true;
3375   }
3376   case Instruction::UDiv:
3377   case Instruction::SDiv:
3378   case Instruction::SRem:
3379   case Instruction::URem:
3380     // TODO: We can use the loop-preheader as a context point here and get
3381     // context-sensitive reasoning.
3382     return !isSafeToSpeculativelyExecute(I);
3383   case Instruction::Call:
3384     return Legal->isMaskRequired(I);
3385   }
3386 }
3387 
3388 std::pair<InstructionCost, InstructionCost>
3389 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3390                                                     ElementCount VF) const {
3391   assert(I->getOpcode() == Instruction::UDiv ||
3392          I->getOpcode() == Instruction::SDiv ||
3393          I->getOpcode() == Instruction::SRem ||
3394          I->getOpcode() == Instruction::URem);
3395   assert(!isSafeToSpeculativelyExecute(I));
3396 
3397   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3398 
3399   // Scalarization isn't legal for scalable vector types
3400   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3401   if (!VF.isScalable()) {
3402     // Get the scalarization cost and scale this amount by the probability of
3403     // executing the predicated block. If the instruction is not predicated,
3404     // we fall through to the next case.
3405     ScalarizationCost = 0;
3406 
3407     // These instructions have a non-void type, so account for the phi nodes
3408     // that we will create. This cost is likely to be zero. The phi node
3409     // cost, if any, should be scaled by the block probability because it
3410     // models a copy at the end of each predicated block.
3411     ScalarizationCost += VF.getKnownMinValue() *
3412       TTI.getCFInstrCost(Instruction::PHI, CostKind);
3413 
3414     // The cost of the non-predicated instruction.
3415     ScalarizationCost += VF.getKnownMinValue() *
3416       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3417 
3418     // The cost of insertelement and extractelement instructions needed for
3419     // scalarization.
3420     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3421 
3422     // Scale the cost by the probability of executing the predicated blocks.
3423     // This assumes the predicated block for each vector lane is equally
3424     // likely.
3425     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3426   }
3427   InstructionCost SafeDivisorCost = 0;
3428 
3429   auto *VecTy = ToVectorTy(I->getType(), VF);
3430 
3431   // The cost of the select guard to ensure all lanes are well defined
3432   // after we speculate above any internal control flow.
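  // Illustrative sketch (assuming the usual safe-divisor strategy): a guarded
  // division such as
  //   if (d != 0) r = x / d;
  // is speculated as  r = x / select(d != 0, d, 1)  so that the masked-off
  // lanes see a harmless divisor; the select cost added below models that
  // guard.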
3433   SafeDivisorCost += TTI.getCmpSelInstrCost(
3434     Instruction::Select, VecTy,
3435     ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
3436     CmpInst::BAD_ICMP_PREDICATE, CostKind);
3437 
3438   // Certain instructions can be cheaper to vectorize if they have a constant
3439   // second vector operand. One example of this are shifts on x86.
3440   Value *Op2 = I->getOperand(1);
3441   auto Op2Info = TTI.getOperandInfo(Op2);
3442   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3443       Legal->isInvariant(Op2))
3444     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3445 
3446   SmallVector<const Value *, 4> Operands(I->operand_values());
3447   SafeDivisorCost += TTI.getArithmeticInstrCost(
3448     I->getOpcode(), VecTy, CostKind,
3449     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3450     Op2Info, Operands, I);
3451   return {ScalarizationCost, SafeDivisorCost};
3452 }
3453 
3454 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3455     Instruction *I, ElementCount VF) const {
3456   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3457   assert(getWideningDecision(I, VF) == CM_Unknown &&
3458          "Decision should not be set yet.");
3459   auto *Group = getInterleavedAccessGroup(I);
3460   assert(Group && "Must have a group.");
3461 
3462   // If the instruction's allocated size doesn't equal its type size, it
3463   // requires padding and will be scalarized.
3464   auto &DL = I->getDataLayout();
3465   auto *ScalarTy = getLoadStoreType(I);
3466   if (hasIrregularType(ScalarTy, DL))
3467     return false;
3468 
3469   // If the group involves a non-integral pointer, we may not be able to
3470   // losslessly cast all values to a common type.
3471   unsigned InterleaveFactor = Group->getFactor();
3472   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3473   for (unsigned i = 0; i < InterleaveFactor; i++) {
3474     Instruction *Member = Group->getMember(i);
3475     if (!Member)
3476       continue;
3477     auto *MemberTy = getLoadStoreType(Member);
3478     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3479     // Don't coerce non-integral pointers to integers or vice versa.
3480     if (MemberNI != ScalarNI) {
3481       // TODO: Consider adding special nullptr value case here
3482       return false;
3483     } else if (MemberNI && ScalarNI &&
3484                ScalarTy->getPointerAddressSpace() !=
3485                MemberTy->getPointerAddressSpace()) {
3486       return false;
3487     }
3488   }
3489 
3490   // Check if masking is required.
3491   // A Group may need masking for one of two reasons: it resides in a block that
3492   // needs predication, or it was decided to use masking to deal with gaps
3493   // (either a gap at the end of a load-access that may result in a speculative
3494   // load, or any gaps in a store-access).
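  // Illustrative example (not from the original source): for an interleave
  // group with factor 3 where only two members are accessed,
  //   for (i = 0; i < N; ++i) {
  //     x = A[3 * i];       // member 0
  //     y = A[3 * i + 1];   // member 1
  //   }                     // member 2 is a gap
  // the wide load also reads the A[3*i+2] slots, so without a scalar epilogue
  // the last wide load may touch memory past the end and the gap must be
  // masked; a store group with a gap always needs masking so that the missing
  // member's locations are not overwritten.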
3495   bool PredicatedAccessRequiresMasking =
3496       blockNeedsPredicationForAnyReason(I->getParent()) &&
3497       Legal->isMaskRequired(I);
3498   bool LoadAccessWithGapsRequiresEpilogMasking =
3499       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3500       !isScalarEpilogueAllowed();
3501   bool StoreAccessWithGapsRequiresMasking =
3502       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3503   if (!PredicatedAccessRequiresMasking &&
3504       !LoadAccessWithGapsRequiresEpilogMasking &&
3505       !StoreAccessWithGapsRequiresMasking)
3506     return true;
3507 
3508   // If masked interleaving is required, we expect that the user/target had
3509   // enabled it, because otherwise it either wouldn't have been created or
3510   // it should have been invalidated by the CostModel.
3511   assert(useMaskedInterleavedAccesses(TTI) &&
3512          "Masked interleave-groups for predicated accesses are not enabled.");
3513 
3514   if (Group->isReverse())
3515     return false;
3516 
3517   auto *Ty = getLoadStoreType(I);
3518   const Align Alignment = getLoadStoreAlignment(I);
3519   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3520                           : TTI.isLegalMaskedStore(Ty, Alignment);
3521 }
3522 
3523 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3524     Instruction *I, ElementCount VF) {
3525   // Get and ensure we have a valid memory instruction.
3526   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3527 
3528   auto *Ptr = getLoadStorePointerOperand(I);
3529   auto *ScalarTy = getLoadStoreType(I);
3530 
3531   // In order to be widened, the pointer should be consecutive, first of all.
3532   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3533     return false;
3534 
3535   // If the instruction is a store located in a predicated block, it will be
3536   // scalarized.
3537   if (isScalarWithPredication(I, VF))
3538     return false;
3539 
3540   // If the instruction's allocated size doesn't equal its type size, it
3541   // requires padding and will be scalarized.
3542   auto &DL = I->getDataLayout();
3543   if (hasIrregularType(ScalarTy, DL))
3544     return false;
3545 
3546   return true;
3547 }
3548 
3549 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3550   // We should not collect Uniforms more than once per VF. Right now,
3551   // this function is called from collectUniformsAndScalars(), which
3552   // already does this check. Collecting Uniforms for VF=1 does not make any
3553   // sense.
3554 
3555   assert(VF.isVector() && !Uniforms.contains(VF) &&
3556          "This function should not be visited twice for the same VF");
3557 
3558   // Visit the list of Uniforms. Even if we do not find any uniform value, we
3559   // will not analyze this VF again; Uniforms.count(VF) will return 1.
3560   Uniforms[VF].clear();
3561 
3562   // We now know that the loop is vectorizable!
3563   // Collect instructions inside the loop that will remain uniform after
3564   // vectorization.
3565 
3566   // Global values, params and instructions outside of current loop are out of
3567   // scope.
3568   auto isOutOfScope = [&](Value *V) -> bool {
3569     Instruction *I = dyn_cast<Instruction>(V);
3570     return (!I || !TheLoop->contains(I));
3571   };
3572 
3573   // Worklist containing uniform instructions demanding lane 0.
3574   SetVector<Instruction *> Worklist;
3575 
3576   // Add uniform instructions demanding lane 0 to the worklist. Instructions
3577   // that require predication must not be considered uniform after
3578   // vectorization, because that would create an erroneous replicating region
3579   // where only a single instance out of VF should be formed.
3580   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
3581     if (isOutOfScope(I)) {
3582       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3583                         << *I << "\n");
3584       return;
3585     }
3586     if (isPredicatedInst(I)) {
3587       LLVM_DEBUG(
3588           dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3589                  << "\n");
3590       return;
3591     }
3592     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3593     Worklist.insert(I);
3594   };
3595 
3596   // Start with the conditional branches exiting the loop. If the branch
3597   // condition is an instruction contained in the loop that is only used by the
3598   // branch, it is uniform.
3599   SmallVector<BasicBlock *> Exiting;
3600   TheLoop->getExitingBlocks(Exiting);
3601   for (BasicBlock *E : Exiting) {
3602     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3603     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3604       addToWorklistIfAllowed(Cmp);
3605   }
3606 
3607   auto PrevVF = VF.divideCoefficientBy(2);
3608   // Return true if all lanes perform the same memory operation, and we can
3609   // thus choose to execute only one.
3610   auto isUniformMemOpUse = [&](Instruction *I) {
3611     // If the value was already known to not be uniform for the previous
3612     // (smaller VF), it cannot be uniform for the larger VF.
3613     if (PrevVF.isVector()) {
3614       auto Iter = Uniforms.find(PrevVF);
3615       if (Iter != Uniforms.end() && !Iter->second.contains(I))
3616         return false;
3617     }
3618     if (!Legal->isUniformMemOp(*I, VF))
3619       return false;
3620     if (isa<LoadInst>(I))
3621       // Loading the same address always produces the same result - at least
3622       // assuming aliasing and ordering which have already been checked.
3623       return true;
3624     // Storing the same value on every iteration.
3625     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3626   };
3627 
3628   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
3629     InstWidening WideningDecision = getWideningDecision(I, VF);
3630     assert(WideningDecision != CM_Unknown &&
3631            "Widening decision should be ready at this moment");
3632 
3633     if (isUniformMemOpUse(I))
3634       return true;
3635 
3636     return (WideningDecision == CM_Widen ||
3637             WideningDecision == CM_Widen_Reverse ||
3638             WideningDecision == CM_Interleave);
3639   };
3640 
3641   // Returns true if Ptr is the pointer operand of a memory access instruction
3642   // I, I is known to not require scalarization, and the pointer is not also
3643   // stored.
3644   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3645     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3646       return false;
3647     return getLoadStorePointerOperand(I) == Ptr &&
3648            (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3649   };
3650 
3651   // Holds a list of values which are known to have at least one uniform use.
3652   // Note that there may be other uses which aren't uniform.  A "uniform use"
3653   // here is something which only demands lane 0 of the unrolled iterations;
3654   // it does not imply that all lanes produce the same value (e.g. this is not
3655   // the usual meaning of uniform).
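  // Illustrative example (not from the original source): the address feeding
  // a consecutive, widened load has a "uniform use" - the generated vector
  // load only needs the scalar address of lane 0 - even though the address
  // value itself differs for every lane.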
3656   SetVector<Value *> HasUniformUse;
3657 
3658   // Scan the loop for instructions which are either a) known to have only
3659   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3660   for (auto *BB : TheLoop->blocks())
3661     for (auto &I : *BB) {
3662       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3663         switch (II->getIntrinsicID()) {
3664         case Intrinsic::sideeffect:
3665         case Intrinsic::experimental_noalias_scope_decl:
3666         case Intrinsic::assume:
3667         case Intrinsic::lifetime_start:
3668         case Intrinsic::lifetime_end:
3669           if (TheLoop->hasLoopInvariantOperands(&I))
3670             addToWorklistIfAllowed(&I);
3671           break;
3672         default:
3673           break;
3674         }
3675       }
3676 
3677       // ExtractValue instructions must be uniform, because the operands are
3678       // known to be loop-invariant.
3679       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3680         assert(isOutOfScope(EVI->getAggregateOperand()) &&
3681                "Expected aggregate value to be loop invariant");
3682         addToWorklistIfAllowed(EVI);
3683         continue;
3684       }
3685 
3686       // If there's no pointer operand, there's nothing to do.
3687       auto *Ptr = getLoadStorePointerOperand(&I);
3688       if (!Ptr)
3689         continue;
3690 
3691       if (isUniformMemOpUse(&I))
3692         addToWorklistIfAllowed(&I);
3693 
3694       if (isVectorizedMemAccessUse(&I, Ptr))
3695         HasUniformUse.insert(Ptr);
3696     }
3697 
3698   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3699   // demanding) users.  Since loops are assumed to be in LCSSA form, this
3700   // disallows uses outside the loop as well.
3701   for (auto *V : HasUniformUse) {
3702     if (isOutOfScope(V))
3703       continue;
3704     auto *I = cast<Instruction>(V);
3705     auto UsersAreMemAccesses =
3706       llvm::all_of(I->users(), [&](User *U) -> bool {
3707         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
3708       });
3709     if (UsersAreMemAccesses)
3710       addToWorklistIfAllowed(I);
3711   }
3712 
3713   // Expand Worklist in topological order: whenever a new instruction
3714   // is added, its users should already be inside Worklist.  This ensures
3715   // a uniform instruction will only be used by uniform instructions.
3716   unsigned idx = 0;
3717   while (idx != Worklist.size()) {
3718     Instruction *I = Worklist[idx++];
3719 
3720     for (auto *OV : I->operand_values()) {
3721       // isOutOfScope operands cannot be uniform instructions.
3722       if (isOutOfScope(OV))
3723         continue;
3724       // First order recurrence Phi's should typically be considered
3725       // non-uniform.
3726       auto *OP = dyn_cast<PHINode>(OV);
3727       if (OP && Legal->isFixedOrderRecurrence(OP))
3728         continue;
3729       // If all the users of the operand are uniform, then add the
3730       // operand into the uniform worklist.
3731       auto *OI = cast<Instruction>(OV);
3732       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3733             auto *J = cast<Instruction>(U);
3734             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
3735           }))
3736         addToWorklistIfAllowed(OI);
3737     }
3738   }
3739 
3740   // For an instruction to be added into Worklist above, all its users inside
3741   // the loop should also be in Worklist. However, this condition cannot be
3742   // true for phi nodes that form a cyclic dependence. We must process phi
3743   // nodes separately. An induction variable will remain uniform if all users
3744   // of the induction variable and induction variable update remain uniform.
3745   // The code below handles both pointer and non-pointer induction variables.
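  // Illustrative example (not from the original source): in
  //   for (i = 0; i < n; ++i)
  //     A[i] = x;
  // the induction i feeds only the (lane-0 demanding) address computation of
  // the widened store and its own update, so both i and i + 1 remain uniform
  // and only a scalar copy of them is generated.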
3746   BasicBlock *Latch = TheLoop->getLoopLatch();
3747   for (const auto &Induction : Legal->getInductionVars()) {
3748     auto *Ind = Induction.first;
3749     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3750 
3751     // Determine if all users of the induction variable are uniform after
3752     // vectorization.
3753     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3754       auto *I = cast<Instruction>(U);
3755       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3756              isVectorizedMemAccessUse(I, Ind);
3757     });
3758     if (!UniformInd)
3759       continue;
3760 
3761     // Determine if all users of the induction variable update instruction are
3762     // uniform after vectorization.
3763     auto UniformIndUpdate =
3764         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3765           auto *I = cast<Instruction>(U);
3766           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3767                  isVectorizedMemAccessUse(I, IndUpdate);
3768         });
3769     if (!UniformIndUpdate)
3770       continue;
3771 
3772     // The induction variable and its update instruction will remain uniform.
3773     addToWorklistIfAllowed(Ind);
3774     addToWorklistIfAllowed(IndUpdate);
3775   }
3776 
3777   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3778 }
3779 
3780 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3781   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3782 
3783   if (Legal->getRuntimePointerChecking()->Need) {
3784     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3785         "runtime pointer checks needed. Enable vectorization of this "
3786         "loop with '#pragma clang loop vectorize(enable)' when "
3787         "compiling with -Os/-Oz",
3788         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3789     return true;
3790   }
3791 
3792   if (!PSE.getPredicate().isAlwaysTrue()) {
3793     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3794         "runtime SCEV checks needed. Enable vectorization of this "
3795         "loop with '#pragma clang loop vectorize(enable)' when "
3796         "compiling with -Os/-Oz",
3797         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3798     return true;
3799   }
3800 
3801   // FIXME: Avoid specializing for stride==1 instead of bailing out.
3802   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3803     reportVectorizationFailure("Runtime stride check for small trip count",
3804         "runtime stride == 1 checks needed. Enable vectorization of "
3805         "this loop without such check by compiling with -Os/-Oz",
3806         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3807     return true;
3808   }
3809 
3810   return false;
3811 }
3812 
3813 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3814   if (IsScalableVectorizationAllowed)
3815     return *IsScalableVectorizationAllowed;
3816 
3817   IsScalableVectorizationAllowed = false;
3818   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3819     return false;
3820 
3821   if (Hints->isScalableVectorizationDisabled()) {
3822     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3823                             "ScalableVectorizationDisabled", ORE, TheLoop);
3824     return false;
3825   }
3826 
3827   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3828 
3829   auto MaxScalableVF = ElementCount::getScalable(
3830       std::numeric_limits<ElementCount::ScalarTy>::max());
3831 
3832   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3833   // FIXME: While for scalable vectors this is currently sufficient, this should
3834   // be replaced by a more detailed mechanism that filters out specific VFs,
3835   // instead of invalidating vectorization for a whole set of VFs based on the
3836   // MaxVF.
3837 
3838   // Disable scalable vectorization if the loop contains unsupported reductions.
3839   if (!canVectorizeReductions(MaxScalableVF)) {
3840     reportVectorizationInfo(
3841         "Scalable vectorization not supported for the reduction "
3842         "operations found in this loop.",
3843         "ScalableVFUnfeasible", ORE, TheLoop);
3844     return false;
3845   }
3846 
3847   // Disable scalable vectorization if the loop contains any instructions
3848   // with element types not supported for scalable vectors.
3849   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3850         return !Ty->isVoidTy() &&
3851                !this->TTI.isElementTypeLegalForScalableVector(Ty);
3852       })) {
3853     reportVectorizationInfo("Scalable vectorization is not supported "
3854                             "for all element types found in this loop.",
3855                             "ScalableVFUnfeasible", ORE, TheLoop);
3856     return false;
3857   }
3858 
3859   if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3860     reportVectorizationInfo("The target does not provide maximum vscale value "
3861                             "for safe distance analysis.",
3862                             "ScalableVFUnfeasible", ORE, TheLoop);
3863     return false;
3864   }
3865 
3866   IsScalableVectorizationAllowed = true;
3867   return true;
3868 }
3869 
3870 ElementCount
3871 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3872   if (!isScalableVectorizationAllowed())
3873     return ElementCount::getScalable(0);
3874 
3875   auto MaxScalableVF = ElementCount::getScalable(
3876       std::numeric_limits<ElementCount::ScalarTy>::max());
3877   if (Legal->isSafeForAnyVectorWidth())
3878     return MaxScalableVF;
3879 
3880   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3881   // Limit MaxScalableVF by the maximum safe dependence distance.
3882   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3883 
3884   if (!MaxScalableVF)
3885     reportVectorizationInfo(
3886         "Max legal vector width too small, scalable vectorization "
3887         "unfeasible.",
3888         "ScalableVFUnfeasible", ORE, TheLoop);
3889 
3890   return MaxScalableVF;
3891 }
3892 
3893 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3894     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3895   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3896   unsigned SmallestType, WidestType;
3897   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3898 
3899   // Get the maximum safe dependence distance in bits computed by LAA.
3900   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3901   // the memory access that is most restrictive (involved in the smallest
3902   // dependence distance).
3903   unsigned MaxSafeElements =
3904       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
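  // Worked example (illustrative values): with a maximum safe vector width of
  // 256 bits and a widest type of 32 bits, MaxSafeElements =
  // bit_floor(256 / 32) = 8, i.e. at most 8 elements per vector iteration.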
3905 
3906   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3907   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3908 
3909   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3910                     << ".\n");
3911   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3912                     << ".\n");
3913 
3914   // First analyze the UserVF, fall back if the UserVF should be ignored.
3915   if (UserVF) {
3916     auto MaxSafeUserVF =
3917         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3918 
3919     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3920       // If `VF=vscale x N` is safe, then so is `VF=N`
3921       if (UserVF.isScalable())
3922         return FixedScalableVFPair(
3923             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3924       else
3925         return UserVF;
3926     }
3927 
3928     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3929 
3930     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3931     // is better to ignore the hint and let the compiler choose a suitable VF.
3932     if (!UserVF.isScalable()) {
3933       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3934                         << " is unsafe, clamping to max safe VF="
3935                         << MaxSafeFixedVF << ".\n");
3936       ORE->emit([&]() {
3937         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3938                                           TheLoop->getStartLoc(),
3939                                           TheLoop->getHeader())
3940                << "User-specified vectorization factor "
3941                << ore::NV("UserVectorizationFactor", UserVF)
3942                << " is unsafe, clamping to maximum safe vectorization factor "
3943                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3944       });
3945       return MaxSafeFixedVF;
3946     }
3947 
3948     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3949       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3950                         << " is ignored because scalable vectors are not "
3951                            "available.\n");
3952       ORE->emit([&]() {
3953         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3954                                           TheLoop->getStartLoc(),
3955                                           TheLoop->getHeader())
3956                << "User-specified vectorization factor "
3957                << ore::NV("UserVectorizationFactor", UserVF)
3958                << " is ignored because the target does not support scalable "
3959                   "vectors. The compiler will pick a more suitable value.";
3960       });
3961     } else {
3962       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3963                         << " is unsafe. Ignoring scalable UserVF.\n");
3964       ORE->emit([&]() {
3965         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3966                                           TheLoop->getStartLoc(),
3967                                           TheLoop->getHeader())
3968                << "User-specified vectorization factor "
3969                << ore::NV("UserVectorizationFactor", UserVF)
3970                << " is unsafe. Ignoring the hint to let the compiler pick a "
3971                   "more suitable value.";
3972       });
3973     }
3974   }
3975 
3976   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3977                     << " / " << WidestType << " bits.\n");
3978 
3979   FixedScalableVFPair Result(ElementCount::getFixed(1),
3980                              ElementCount::getScalable(0));
3981   if (auto MaxVF =
3982           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3983                                   MaxSafeFixedVF, FoldTailByMasking))
3984     Result.FixedVF = MaxVF;
3985 
3986   if (auto MaxVF =
3987           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3988                                   MaxSafeScalableVF, FoldTailByMasking))
3989     if (MaxVF.isScalable()) {
3990       Result.ScalableVF = MaxVF;
3991       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3992                         << "\n");
3993     }
3994 
3995   return Result;
3996 }
3997 
3998 FixedScalableVFPair
3999 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4000   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4001     // TODO: It may be useful to do this, since the check is still likely to
4002     // be dynamically uniform if the target can skip it.
4003     reportVectorizationFailure(
4004         "Not inserting runtime ptr check for divergent target",
4005         "runtime pointer checks needed. Not enabled for divergent target",
4006         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4007     return FixedScalableVFPair::getNone();
4008   }
4009 
4010   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4011   unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4012   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4013   if (TC == 1) {
4014     reportVectorizationFailure("Single iteration (non) loop",
4015         "loop trip count is one, irrelevant for vectorization",
4016         "SingleIterationLoop", ORE, TheLoop);
4017     return FixedScalableVFPair::getNone();
4018   }
4019 
4020   switch (ScalarEpilogueStatus) {
4021   case CM_ScalarEpilogueAllowed:
4022     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4023   case CM_ScalarEpilogueNotAllowedUsePredicate:
4024     [[fallthrough]];
4025   case CM_ScalarEpilogueNotNeededUsePredicate:
4026     LLVM_DEBUG(
4027         dbgs() << "LV: vector predicate hint/switch found.\n"
4028                << "LV: Not allowing scalar epilogue, creating predicated "
4029                << "vector loop.\n");
4030     break;
4031   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4032     // fallthrough as a special case of OptForSize
4033   case CM_ScalarEpilogueNotAllowedOptSize:
4034     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4035       LLVM_DEBUG(
4036           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4037     else
4038       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4039                         << "count.\n");
4040 
4041     // Bail if runtime checks are required, which are not good when optimising
4042     // for size.
4043     if (runtimeChecksRequired())
4044       return FixedScalableVFPair::getNone();
4045 
4046     break;
4047   }
4048 
4049   // The only loops we can vectorize without a scalar epilogue are loops with
4050   // a bottom-test and a single exiting block. We'd have to handle the fact
4051   // that not every instruction executes on the last iteration.  This will
4052   // require a lane mask which varies through the vector loop body.  (TODO)
4053   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4054     // If there was a tail-folding hint/switch, but we can't fold the tail by
4055     // masking, fallback to a vectorization with a scalar epilogue.
4056     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4057       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4058                            "scalar epilogue instead.\n");
4059       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4060       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4061     }
4062     return FixedScalableVFPair::getNone();
4063   }
4064 
4065   // Now try tail folding.
4066 
4067   // Invalidate interleave groups that require an epilogue if we can't mask
4068   // the interleave-group.
4069   if (!useMaskedInterleavedAccesses(TTI)) {
4070     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4071            "No decisions should have been taken at this point");
4072     // Note: There is no need to invalidate any cost modeling decisions here, as
4073     // none were taken so far.
4074     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4075   }
4076 
4077   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4078 
4079   // Avoid tail folding if the trip count is known to be a multiple of any VF
4080   // we choose.
4081   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4082       MaxFactors.FixedVF.getFixedValue();
4083   if (MaxFactors.ScalableVF) {
4084     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4085     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4086       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4087           *MaxPowerOf2RuntimeVF,
4088           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4089     } else
4090       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4091   }
4092 
4093   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4094     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4095            "MaxFixedVF must be a power of 2");
4096     unsigned MaxVFtimesIC =
4097         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4098     ScalarEvolution *SE = PSE.getSE();
4099     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4100     const SCEV *ExitCount = SE->getAddExpr(
4101         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4102     const SCEV *Rem = SE->getURemExpr(
4103         SE->applyLoopGuards(ExitCount, TheLoop),
4104         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4105     if (Rem->isZero()) {
4106       // Accept MaxFixedVF if we do not have a tail.
4107       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4108       return MaxFactors;
4109     }
4110   }
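  // Worked example (illustrative values): for a trip count known to be 128
  // with MaxPowerOf2RuntimeVF = 16 and UserIC = 2, the remainder
  // 128 % (16 * 2) is zero, so no tail remains and tail folding is skipped.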
4111 
4112   // If we don't know the precise trip count, or if the trip count that we
4113   // found modulo the vectorization factor is not zero, try to fold the tail
4114   // by masking.
4115   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4116   setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4117   if (foldTailByMasking()) {
4118     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4119       LLVM_DEBUG(
4120           dbgs()
4121           << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4122              "try to generate VP Intrinsics with scalable vector "
4123              "factors only.\n");
4124       // Tail folded loop using VP intrinsics restricts the VF to be scalable
4125       // for now.
4126       // TODO: extend it for fixed vectors, if required.
4127       assert(MaxFactors.ScalableVF.isScalable() &&
4128              "Expected scalable vector factor.");
4129 
4130       MaxFactors.FixedVF = ElementCount::getFixed(1);
4131     }
4132     return MaxFactors;
4133   }
4134 
4135   // If there was a tail-folding hint/switch, but we can't fold the tail by
4136   // masking, fallback to a vectorization with a scalar epilogue.
4137   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4138     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4139                          "scalar epilogue instead.\n");
4140     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4141     return MaxFactors;
4142   }
4143 
4144   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4145     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4146     return FixedScalableVFPair::getNone();
4147   }
4148 
4149   if (TC == 0) {
4150     reportVectorizationFailure(
4151         "Unable to calculate the loop count due to complex control flow",
4152         "unable to calculate the loop count due to complex control flow",
4153         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4154     return FixedScalableVFPair::getNone();
4155   }
4156 
4157   reportVectorizationFailure(
4158       "Cannot optimize for size and vectorize at the same time.",
4159       "cannot optimize for size and vectorize at the same time. "
4160       "Enable vectorization of this loop with '#pragma clang loop "
4161       "vectorize(enable)' when compiling with -Os/-Oz",
4162       "NoTailLoopWithOptForSize", ORE, TheLoop);
4163   return FixedScalableVFPair::getNone();
4164 }
4165 
4166 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4167     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4168     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4169   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4170   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4171       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4172                            : TargetTransformInfo::RGK_FixedWidthVector);
4173 
4174   // Convenience function to return the minimum of two ElementCounts.
4175   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4176     assert((LHS.isScalable() == RHS.isScalable()) &&
4177            "Scalable flags must match");
4178     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4179   };
4180 
4181   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4182   // Note that both WidestRegister and WidestType may not be powers of 2.
4183   auto MaxVectorElementCount = ElementCount::get(
4184       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4185       ComputeScalableMaxVF);
4186   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4187   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4188                     << (MaxVectorElementCount * WidestType) << " bits.\n");
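  // Worked example (illustrative values): with 256-bit fixed-width registers
  // and a widest type of 32 bits, MaxVectorElementCount =
  // min(bit_floor(256 / 32), MaxSafeVF) = min(8, MaxSafeVF).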
4189 
4190   if (!MaxVectorElementCount) {
4191     LLVM_DEBUG(dbgs() << "LV: The target has no "
4192                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4193                       << " vector registers.\n");
4194     return ElementCount::getFixed(1);
4195   }
4196 
4197   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4198   if (MaxVectorElementCount.isScalable() &&
4199       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4200     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4201     auto Min = Attr.getVScaleRangeMin();
4202     WidestRegisterMinEC *= Min;
4203   }
4204 
4205   // When a scalar epilogue is required, at least one iteration of the scalar
4206   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4207   // max VF that results in a dead vector loop.
4208   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4209     MaxTripCount -= 1;
4210 
4211   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4212       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4213     // If upper bound loop trip count (TC) is known at compile time there is no
4214     // point in choosing VF greater than TC (as done in the loop below). Select
4215     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4216     // scalable, we only fall back on a fixed VF when the TC is less than or
4217     // equal to the known number of lanes.
4218     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4219     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4220                          "exceeding the constant trip count: "
4221                       << ClampedUpperTripCount << "\n");
4222     return ElementCount::get(
4223         ClampedUpperTripCount,
4224         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4225   }
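  // Worked example (illustrative values): with a known maximum trip count of
  // 6 and a 16-element maximum VF, the VF is clamped to bit_floor(6) = 4, so
  // we do not pick a VF that leaves the vector loop dead for such short trip
  // counts.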
4226 
4227   TargetTransformInfo::RegisterKind RegKind =
4228       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4229                            : TargetTransformInfo::RGK_FixedWidthVector;
4230   ElementCount MaxVF = MaxVectorElementCount;
4231   if (MaximizeBandwidth ||
4232       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4233        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4234         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4235     auto MaxVectorElementCountMaxBW = ElementCount::get(
4236         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4237         ComputeScalableMaxVF);
4238     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4239 
4240     // Collect all viable vectorization factors larger than the default MaxVF
4241     // (i.e. MaxVectorElementCount).
4242     SmallVector<ElementCount, 8> VFs;
4243     for (ElementCount VS = MaxVectorElementCount * 2;
4244          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4245       VFs.push_back(VS);
4246 
4247     // For each VF calculate its register usage.
4248     auto RUs = calculateRegisterUsage(VFs);
4249 
4250     // Select the largest VF which doesn't require more registers than existing
4251     // ones.
4252     for (int I = RUs.size() - 1; I >= 0; --I) {
4253       const auto &MLU = RUs[I].MaxLocalUsers;
4254       if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4255             return LU.second <= TTI.getNumberOfRegisters(LU.first);
4256           })) {
4257         MaxVF = VFs[I];
4258         break;
4259       }
4260     }
4261     if (ElementCount MinVF =
4262             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4263       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4264         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4265                           << ") with target's minimum: " << MinVF << '\n');
4266         MaxVF = MinVF;
4267       }
4268     }
4269 
4270     // Invalidate any widening decisions we might have made, in case the loop
4271     // requires predication (decided later), but we have already made some
4272     // load/store widening decisions.
4273     invalidateCostModelingDecisions();
4274   }
4275   return MaxVF;
4276 }
4277 
4278 /// Convenience function that returns the value of vscale_range iff
4279 /// vscale_range.min == vscale_range.max or otherwise returns the value
4280 /// returned by the corresponding TTI method.
4281 static std::optional<unsigned>
4282 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4283   const Function *Fn = L->getHeader()->getParent();
4284   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4285     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4286     auto Min = Attr.getVScaleRangeMin();
4287     auto Max = Attr.getVScaleRangeMax();
4288     if (Max && Min == Max)
4289       return Max;
4290   }
4291 
4292   return TTI.getVScaleForTuning();
4293 }
4294 
4295 bool LoopVectorizationPlanner::isMoreProfitable(
4296     const VectorizationFactor &A, const VectorizationFactor &B) const {
4297   InstructionCost CostA = A.Cost;
4298   InstructionCost CostB = B.Cost;
4299 
4300   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4301 
4302   // Improve estimate for the vector width if it is scalable.
4303   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4304   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4305   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4306     if (A.Width.isScalable())
4307       EstimatedWidthA *= *VScale;
4308     if (B.Width.isScalable())
4309       EstimatedWidthB *= *VScale;
4310   }
4311 
4312   // Assume vscale may be larger than 1 (or the value being tuned for),
4313   // so that scalable vectorization is slightly favorable over fixed-width
4314   // vectorization.
4315   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4316                         A.Width.isScalable() && !B.Width.isScalable();
4317 
4318   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4319                                 const InstructionCost &RHS) {
4320     return PreferScalable ? LHS <= RHS : LHS < RHS;
4321   };
4322 
4323   // To avoid the need for FP division:
4324   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4325   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
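  // Worked example (illustrative values): CostA = 8 at width 4 vs. CostB = 6
  // at width 2 gives per-lane costs 2.0 vs. 3.0; the cross-multiplied check
  // compares 8 * 2 = 16 against 6 * 4 = 24 and reaches the same conclusion
  // without dividing.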
4326   if (!MaxTripCount)
4327     return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4328 
4329   auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4330                                            InstructionCost VectorCost,
4331                                            InstructionCost ScalarCost) {
4332     // If the trip count is a known (possibly small) constant, the trip count
4333     // will be rounded up to an integer number of iterations under
4334     // FoldTailByMasking. The total cost in that case will be
4335     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4336     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4337     // some extra overheads, but for the purpose of comparing the costs of
4338     // different VFs we can use this to compare the total loop-body cost
4339     // expected after vectorization.
4340     if (CM.foldTailByMasking())
4341       return VectorCost * divideCeil(MaxTripCount, VF);
4342     return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4343   };
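  // Worked example (illustrative values): with MaxTripCount = 10, VF = 4,
  // VectorCost = 20 and ScalarCost = 4, folding the tail costs
  // 20 * ceil(10 / 4) = 60, while using a scalar epilogue costs
  // 20 * 2 + 4 * 2 = 48.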
4344 
4345   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4346   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4347   return CmpFn(RTCostA, RTCostB);
4348 }
4349 
4350 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4351                                    OptimizationRemarkEmitter *ORE,
4352                                    Loop *TheLoop) {
4353   if (InvalidCosts.empty())
4354     return;
4355 
4356   // Emit a report of VFs with invalid costs in the loop.
4357 
4358   // Group the remarks per instruction, keeping the instruction order from
4359   // InvalidCosts.
4360   std::map<Instruction *, unsigned> Numbering;
4361   unsigned I = 0;
4362   for (auto &Pair : InvalidCosts)
4363     if (!Numbering.count(Pair.first))
4364       Numbering[Pair.first] = I++;
4365 
4366   // Sort the list, first on instruction(number) then on VF.
4367   sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4368     if (Numbering[A.first] != Numbering[B.first])
4369       return Numbering[A.first] < Numbering[B.first];
4370     const auto &LHS = A.second;
4371     const auto &RHS = B.second;
4372     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4373            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4374   });
4375 
4376   // For a list of ordered instruction-vf pairs:
4377   //   [(load, vf1), (load, vf2), (store, vf1)]
4378   // Group the instructions together to emit separate remarks for:
4379   //   load  (vf1, vf2)
4380   //   store (vf1)
4381   auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4382   auto Subset = ArrayRef<InstructionVFPair>();
4383   do {
4384     if (Subset.empty())
4385       Subset = Tail.take_front(1);
4386 
4387     Instruction *I = Subset.front().first;
4388 
4389     // If the next instruction is different, or if there are no other pairs,
4390     // emit a remark for the collated subset. e.g.
4391     //   [(load, vf1), (load, vf2))]
4392     // to emit:
4393     //  remark: invalid costs for 'load' at VF=(vf1, vf2)
4394     if (Subset == Tail || Tail[Subset.size()].first != I) {
4395       std::string OutString;
4396       raw_string_ostream OS(OutString);
4397       assert(!Subset.empty() && "Unexpected empty range");
4398       OS << "Instruction with invalid costs prevented vectorization at VF=(";
4399       for (const auto &Pair : Subset)
4400         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4401       OS << "):";
4402       if (auto *CI = dyn_cast<CallInst>(I))
4403         OS << " call to " << CI->getCalledFunction()->getName();
4404       else
4405         OS << " " << I->getOpcodeName();
4406       OS.flush();
4407       reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4408       Tail = Tail.drop_front(Subset.size());
4409       Subset = {};
4410     } else
4411       // Grow the subset by one element
4412       Subset = Tail.take_front(Subset.size() + 1);
4413   } while (!Tail.empty());
4414 }
4415 
4416 /// Check if any recipe of \p Plan will generate a vector value, which will be
4417 /// assigned a vector register.
4418 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4419                                 const TargetTransformInfo &TTI) {
4420   assert(VF.isVector() && "Checking a scalar VF?");
4421   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
4422                           Plan.getCanonicalIV()->getScalarType()->getContext());
4423   DenseSet<VPRecipeBase *> EphemeralRecipes;
4424   collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4425   // Set of already visited types.
4426   DenseSet<Type *> Visited;
4427   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4428            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4429     for (VPRecipeBase &R : *VPBB) {
4430       if (EphemeralRecipes.contains(&R))
4431         continue;
4432       // Continue early if the recipe is considered to not produce a vector
4433       //  result. Note that this includes VPInstruction where some opcodes may
4434       // produce a vector, to preserve existing behavior as VPInstructions model
4435       // aspects not directly mapped to existing IR instructions.
4436       switch (R.getVPDefID()) {
4437       case VPDef::VPDerivedIVSC:
4438       case VPDef::VPScalarIVStepsSC:
4439       case VPDef::VPScalarCastSC:
4440       case VPDef::VPReplicateSC:
4441       case VPDef::VPInstructionSC:
4442       case VPDef::VPCanonicalIVPHISC:
4443       case VPDef::VPVectorPointerSC:
4444       case VPDef::VPExpandSCEVSC:
4445       case VPDef::VPEVLBasedIVPHISC:
4446       case VPDef::VPPredInstPHISC:
4447       case VPDef::VPBranchOnMaskSC:
4448         continue;
4449       case VPDef::VPReductionSC:
4450       case VPDef::VPActiveLaneMaskPHISC:
4451       case VPDef::VPWidenCallSC:
4452       case VPDef::VPWidenCanonicalIVSC:
4453       case VPDef::VPWidenCastSC:
4454       case VPDef::VPWidenGEPSC:
4455       case VPDef::VPWidenSC:
4456       case VPDef::VPWidenSelectSC:
4457       case VPDef::VPBlendSC:
4458       case VPDef::VPFirstOrderRecurrencePHISC:
4459       case VPDef::VPWidenPHISC:
4460       case VPDef::VPWidenIntOrFpInductionSC:
4461       case VPDef::VPWidenPointerInductionSC:
4462       case VPDef::VPReductionPHISC:
4463       case VPDef::VPInterleaveSC:
4464       case VPDef::VPWidenLoadEVLSC:
4465       case VPDef::VPWidenLoadSC:
4466       case VPDef::VPWidenStoreEVLSC:
4467       case VPDef::VPWidenStoreSC:
4468         break;
4469       default:
4470         llvm_unreachable("unhandled recipe");
4471       }
4472 
4473       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4474         Type *VectorTy = ToVectorTy(ScalarTy, VF);
4475         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4476         if (!NumLegalParts)
4477           return false;
4478         if (VF.isScalable()) {
4479           // <vscale x 1 x iN> is assumed to be profitable over iN because
4480           // scalable registers are a distinct register class from scalar
4481           // ones. If we ever find a target which wants to lower scalable
4482           // vectors back to scalars, we'll need to update this code to
4483           // explicitly ask TTI about the register class uses for each part.
4484           return NumLegalParts <= VF.getKnownMinValue();
4485         }
4486         // Two or more parts that share a register - are vectorized.
4487         return NumLegalParts < VF.getKnownMinValue();
4488       };
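      // Illustrative example (not from the original source): on a target with
      // 128-bit vectors, an i32 value at VF = 8 legalizes to 2 parts, and
      // 2 < 8 means it genuinely occupies vector registers; a type that would
      // be scalarized into 8 parts would not count as generating vectors.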
4489 
4490       // If there is no def and this is not a store (e.g., a branch), continue - no value to check.
4491       if (R.getNumDefinedValues() == 0 &&
4492           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4493               &R))
4494         continue;
4495       // For multi-def recipes (currently only interleaved loads), it
4496       // suffices to check only the first def.
4497       // For stores, check their stored value; for interleaved stores it
4498       // suffices to check only the first stored value. In all cases this is
4499       // the second operand.
4500       VPValue *ToCheck =
4501           R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4502       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4503       if (!Visited.insert({ScalarTy}).second)
4504         continue;
4505       if (WillWiden(ScalarTy))
4506         return true;
4507     }
4508   }
4509 
4510   return false;
4511 }
4512 
4513 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4514   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4515   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4516   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4517   assert(any_of(VPlans,
4518                 [](std::unique_ptr<VPlan> &P) {
4519                   return P->hasVF(ElementCount::getFixed(1));
4520                 }) &&
4521          "Expected Scalar VF to be a candidate");
4522 
4523   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4524                                        ExpectedCost);
4525   VectorizationFactor ChosenFactor = ScalarCost;
4526 
4527   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4528   if (ForceVectorization &&
4529       (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4530     // Ignore scalar width, because the user explicitly wants vectorization.
4531     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4532     // evaluation.
4533     ChosenFactor.Cost = InstructionCost::getMax();
4534   }
4535 
4536   SmallVector<InstructionVFPair> InvalidCosts;
4537   for (auto &P : VPlans) {
4538     for (ElementCount VF : P->vectorFactors()) {
4539       // The cost for scalar VF=1 is already calculated, so ignore it.
4540       if (VF.isScalar())
4541         continue;
4542 
4543       InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
4544       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4545 
4546 #ifndef NDEBUG
4547       unsigned AssumedMinimumVscale =
4548           getVScaleForTuning(OrigLoop, TTI).value_or(1);
4549       unsigned Width =
4550           Candidate.Width.isScalable()
4551               ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4552               : Candidate.Width.getFixedValue();
4553       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4554                         << " costs: " << (Candidate.Cost / Width));
4555       if (VF.isScalable())
4556         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4557                           << AssumedMinimumVscale << ")");
4558       LLVM_DEBUG(dbgs() << ".\n");
4559 #endif
4560 
4561       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4562         LLVM_DEBUG(
4563             dbgs()
4564             << "LV: Not considering vector loop of width " << VF
4565             << " because it will not generate any vector instructions.\n");
4566         continue;
4567       }
4568 
4569       // If profitable, add it to the ProfitableVFs list.
4570       if (isMoreProfitable(Candidate, ScalarCost))
4571         ProfitableVFs.push_back(Candidate);
4572 
4573       if (isMoreProfitable(Candidate, ChosenFactor))
4574         ChosenFactor = Candidate;
4575     }
4576   }
4577 
4578   emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
4579 
4580   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4581     reportVectorizationFailure(
4582         "There are conditional stores.",
4583         "store that is conditionally executed prevents vectorization",
4584         "ConditionalStore", ORE, OrigLoop);
4585     ChosenFactor = ScalarCost;
4586   }
4587 
4588   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4589                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4590              << "LV: Vectorization seems to be not beneficial, "
4591              << "but was forced by a user.\n");
4592   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
4593   return ChosenFactor;
4594 }
4595 
4596 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4597     ElementCount VF) const {
4598   // Cross iteration phis such as reductions need special handling and are
4599   // currently unsupported.
4600   if (any_of(OrigLoop->getHeader()->phis(),
4601              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4602     return false;
4603 
4604   // Phis with uses outside of the loop require special handling and are
4605   // currently unsupported.
4606   for (const auto &Entry : Legal->getInductionVars()) {
4607     // Look for uses of the value of the induction at the last iteration.
4608     Value *PostInc =
4609         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4610     for (User *U : PostInc->users())
4611       if (!OrigLoop->contains(cast<Instruction>(U)))
4612         return false;
4613     // Look for uses of penultimate value of the induction.
4614     for (User *U : Entry.first->users())
4615       if (!OrigLoop->contains(cast<Instruction>(U)))
4616         return false;
4617   }
4618 
4619   // Epilogue vectorization code has not been audited to ensure it handles
4620   // non-latch exits properly.  It may be fine, but it needs to be audited
4621   // and tested.
4622   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4623     return false;
4624 
4625   return true;
4626 }
4627 
4628 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4629     const ElementCount VF) const {
4630   // FIXME: We need a much better cost-model to take different parameters such
4631   // as register pressure, code size increase and cost of extra branches into
4632   // account. For now we apply a very crude heuristic and only consider loops
4633   // with vectorization factors larger than a certain value.
4634 
4635   // Allow the target to opt out entirely.
4636   if (!TTI.preferEpilogueVectorization())
4637     return false;
4638 
4639   // We also consider epilogue vectorization unprofitable for targets that don't
4640   // consider interleaving beneficial (e.g., MVE).
4641   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4642     return false;
4643 
4644   unsigned Multiplier = 1;
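  // Estimate how many lanes the main loop handles per iteration: for scalable
  // VFs the known minimum lane count is scaled by the vscale value used for
  // tuning (assumed to be 1 if unknown), and an epilogue is only considered
  // worthwhile once this estimate reaches EpilogueVectorizationMinVF. For
  // example, a main VF of vscale x 4 with an assumed vscale of 2 gives an
  // estimate of 8 lanes.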
4645   if (VF.isScalable())
4646     Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
4647   if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
4648     return true;
4649   return false;
4650 }
4651 
4652 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4653     const ElementCount MainLoopVF, unsigned IC) {
4654   VectorizationFactor Result = VectorizationFactor::Disabled();
4655   if (!EnableEpilogueVectorization) {
4656     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4657     return Result;
4658   }
4659 
4660   if (!CM.isScalarEpilogueAllowed()) {
4661     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4662                          "epilogue is allowed.\n");
4663     return Result;
4664   }
4665 
4666   // Not really a cost consideration, but check for unsupported cases here to
4667   // simplify the logic.
4668   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4669     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4670                          "is not a supported candidate.\n");
4671     return Result;
4672   }
4673 
4674   if (EpilogueVectorizationForceVF > 1) {
4675     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4676     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4677     if (hasPlanWithVF(ForcedEC))
4678       return {ForcedEC, 0, 0};
4679     else {
4680       LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4681                            "viable.\n");
4682       return Result;
4683     }
4684   }
4685 
4686   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4687       OrigLoop->getHeader()->getParent()->hasMinSize()) {
4688     LLVM_DEBUG(
4689         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4690     return Result;
4691   }
4692 
4693   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
4694     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4695                          "this loop\n");
4696     return Result;
4697   }
4698 
4699   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4700   // the main loop handles 8 lanes per iteration. We could still benefit from
4701   // vectorizing the epilogue loop with VF=4.
4702   ElementCount EstimatedRuntimeVF = MainLoopVF;
4703   if (MainLoopVF.isScalable()) {
4704     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
4705     if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
4706       EstimatedRuntimeVF *= *VScale;
4707   }
4708 
4709   ScalarEvolution &SE = *PSE.getSE();
4710   Type *TCType = Legal->getWidestInductionType();
4711   const SCEV *RemainingIterations = nullptr;
4712   for (auto &NextVF : ProfitableVFs) {
4713     // Skip candidate VFs without a corresponding VPlan.
4714     if (!hasPlanWithVF(NextVF.Width))
4715       continue;
4716 
4717     // Skip candidate VFs with widths >= the estimated runtime VF (scalable
4718     // vectors) or the VF of the main loop (fixed vectors).
4719     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4720          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4721         ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
4722       continue;
4723 
4724     // If NextVF is greater than the number of remaining iterations, the
4725     // epilogue loop would be dead. Skip such factors.
4726     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4727       // TODO: extend to support scalable VFs.
4728       if (!RemainingIterations) {
4729         const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
4730         RemainingIterations = SE.getURemExpr(
4731             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4732       }
4733       if (SE.isKnownPredicate(
4734               CmpInst::ICMP_UGT,
4735               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4736               RemainingIterations))
4737         continue;
4738     }
4739 
4740     if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
4741       Result = NextVF;
4742   }
4743 
4744   if (Result != VectorizationFactor::Disabled())
4745     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4746                       << Result.Width << "\n");
4747   return Result;
4748 }
4749 
4750 std::pair<unsigned, unsigned>
4751 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4752   unsigned MinWidth = -1U;
4753   unsigned MaxWidth = 8;
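  // MinWidth starts at UINT_MAX so that the first element type seen always
  // lowers it; MaxWidth starts at a floor of 8 bits (and is reset below when
  // only reduction recurrences determine the width).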
4754   const DataLayout &DL = TheFunction->getDataLayout();
4755   // For in-loop reductions, no element types are added to ElementTypesInLoop
4756   // if there are no loads/stores in the loop. In this case, check through the
4757   // reduction variables to determine the maximum width.
4758   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4759     // Reset MaxWidth so that we can find the smallest type used by recurrences
4760     // in the loop.
4761     MaxWidth = -1U;
4762     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4763       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4764       // When finding the min width used by the recurrence we need to account
4765       // for casts on the input operands of the recurrence.
4766       MaxWidth = std::min<unsigned>(
4767           MaxWidth, std::min<unsigned>(
4768                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4769                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4770     }
4771   } else {
4772     for (Type *T : ElementTypesInLoop) {
4773       MinWidth = std::min<unsigned>(
4774           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4775       MaxWidth = std::max<unsigned>(
4776           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4777     }
4778   }
4779   return {MinWidth, MaxWidth};
4780 }
4781 
4782 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4783   ElementTypesInLoop.clear();
4784   // For each block.
4785   for (BasicBlock *BB : TheLoop->blocks()) {
4786     // For each instruction in the loop.
4787     for (Instruction &I : BB->instructionsWithoutDebug()) {
4788       Type *T = I.getType();
4789 
4790       // Skip ignored values.
4791       if (ValuesToIgnore.count(&I))
4792         continue;
4793 
4794       // Only examine Loads, Stores and PHINodes.
4795       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4796         continue;
4797 
4798       // Examine PHI nodes that are reduction variables. Update the type to
4799       // account for the recurrence type.
4800       if (auto *PN = dyn_cast<PHINode>(&I)) {
4801         if (!Legal->isReductionVariable(PN))
4802           continue;
4803         const RecurrenceDescriptor &RdxDesc =
4804             Legal->getReductionVars().find(PN)->second;
4805         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4806             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
4807                                       RdxDesc.getRecurrenceType(),
4808                                       TargetTransformInfo::ReductionFlags()))
4809           continue;
4810         T = RdxDesc.getRecurrenceType();
4811       }
4812 
4813       // Examine the stored values.
4814       if (auto *ST = dyn_cast<StoreInst>(&I))
4815         T = ST->getValueOperand()->getType();
4816 
4817       assert(T->isSized() &&
4818              "Expected the load/store/recurrence type to be sized");
4819 
4820       ElementTypesInLoop.insert(T);
4821     }
4822   }
4823 }
4824 
4825 unsigned
4826 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4827                                                   InstructionCost LoopCost) {
4828   // -- The interleave heuristics --
4829   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4830   // There are many micro-architectural considerations that we can't predict
4831   // at this level. For example, frontend pressure (on decode or fetch) due to
4832   // code size, or the number and capabilities of the execution ports.
4833   //
4834   // We use the following heuristics to select the interleave count:
4835   // 1. If the code has reductions, then we interleave to break the cross
4836   // iteration dependency.
4837   // 2. If the loop is really small, then we interleave to reduce the loop
4838   // overhead.
4839   // 3. We don't interleave if we think that we will spill registers to memory
4840   // due to the increased register pressure.
4841 
4842   if (!isScalarEpilogueAllowed())
4843     return 1;
4844 
4845   // Do not interleave if EVL is preferred and no User IC is specified.
4846   if (foldTailWithEVL()) {
4847     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4848                          "Unroll factor forced to be 1.\n");
4849     return 1;
4850   }
4851 
4852   // The dependence distance already limits how wide we can go; be conservative and do not interleave.
4853   if (!Legal->isSafeForAnyVectorWidth())
4854     return 1;
4855 
4856   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
4857   const bool HasReductions = !Legal->getReductionVars().empty();
4858 
4859   // If we did not calculate the cost for VF (because the user selected the VF)
4860   // then we calculate the cost of VF here.
4861   if (LoopCost == 0) {
4862     LoopCost = expectedCost(VF);
4863     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4864 
4865     // Loop body is free and there is no need for interleaving.
4866     if (LoopCost == 0)
4867       return 1;
4868   }
4869 
4870   RegisterUsage R = calculateRegisterUsage({VF})[0];
4871   // We divide by these counts below, so clamp each to at least one register
4872   // user to avoid dividing by zero.
4873   for (auto& pair : R.MaxLocalUsers) {
4874     pair.second = std::max(pair.second, 1U);
4875   }
4876 
4877   // We calculate the interleave count using the following formula.
4878   // Subtract the number of loop invariants from the number of available
4879   // registers. These registers are used by all of the interleaved instances.
4880   // Next, divide the remaining registers by the number of registers that is
4881   // required by the loop, in order to estimate how many parallel instances
4882   // fit without causing spills. All of this is rounded down if necessary to be
4883   // a power of two. We want power of two interleave count to simplify any
4884   // a power of two. We want a power-of-two interleave count to simplify any
4885   // addressing operations or alignment considerations.
4886   // We also want power-of-two interleave counts to ensure that the induction
4887   // this currently happens when OptForSize, in which case IC is set to 1 above.
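  // As a small worked example of the formula below (before the optional
  // induction-variable adjustment): with 32 registers in a class, 2 of them
  // taken by loop-invariant values and at most 6 in-loop values live at once,
  // (32 - 2) / 6 = 5, which is rounded down to the power of two 4 as the
  // candidate interleave count for that class.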
4888   unsigned IC = UINT_MAX;
4889 
4890   for (auto& pair : R.MaxLocalUsers) {
4891     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4892     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4893                       << " registers of "
4894                       << TTI.getRegisterClassName(pair.first) << " register class\n");
4895     if (VF.isScalar()) {
4896       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4897         TargetNumRegisters = ForceTargetNumScalarRegs;
4898     } else {
4899       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4900         TargetNumRegisters = ForceTargetNumVectorRegs;
4901     }
4902     unsigned MaxLocalUsers = pair.second;
4903     unsigned LoopInvariantRegs = 0;
4904     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
4905       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
4906 
4907     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4908                                      MaxLocalUsers);
4909     // Don't count the induction variable as interleaved.
4910     if (EnableIndVarRegisterHeur) {
4911       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4912                               std::max(1U, (MaxLocalUsers - 1)));
4913     }
4914 
4915     IC = std::min(IC, TmpIC);
4916   }
4917 
4918   // Clamp the interleave ranges to reasonable counts.
4919   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4920 
4921   // Check if the user has overridden the max.
4922   if (VF.isScalar()) {
4923     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4924       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4925   } else {
4926     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4927       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4928   }
4929 
4930   unsigned EstimatedVF = VF.getKnownMinValue();
4931   if (VF.isScalable()) {
4932     if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
4933       EstimatedVF *= *VScale;
4934   }
4935   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4936 
4937   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4938   if (KnownTC > 0) {
4939     // At least one iteration must be scalar when this constraint holds, so the
4940     // number of iterations available for interleaving is one less.
4941     unsigned AvailableTC =
4942         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4943 
4944     // If the trip count is known, we select between two prospective ICs, where
4945     // 1) the aggressive IC is capped by the trip count divided by VF
4946     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4947     // The final IC is selected in a way that the epilogue loop trip count is
4948     // minimized while maximizing the IC itself, so that we either run the
4949     // vector loop at least once if it generates a small epilogue loop, or else
4950     // we run the vector loop at least twice.
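    // For example, with AvailableTC = 64, EstimatedVF = 8 and a target
    // maximum of 8: the aggressive bound is bit_floor(min(64 / 8, 8)) = 8 and
    // the conservative bound is bit_floor(min(64 / 16, 8)) = 4; both leave no
    // scalar tail, so the aggressive value 8 is kept.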
4951 
4952     unsigned InterleaveCountUB = bit_floor(
4953         std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4954     unsigned InterleaveCountLB = bit_floor(std::max(
4955         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4956     MaxInterleaveCount = InterleaveCountLB;
4957 
4958     if (InterleaveCountUB != InterleaveCountLB) {
4959       unsigned TailTripCountUB =
4960           (AvailableTC % (EstimatedVF * InterleaveCountUB));
4961       unsigned TailTripCountLB =
4962           (AvailableTC % (EstimatedVF * InterleaveCountLB));
4963       // If both produce the same scalar tail, maximize the IC to do the same
4964       // work in fewer vector loop iterations.
4965       if (TailTripCountUB == TailTripCountLB)
4966         MaxInterleaveCount = InterleaveCountUB;
4967     }
4968   } else if (BestKnownTC && *BestKnownTC > 0) {
4969     // At least one iteration must be scalar when this constraint holds, so the
4970     // number of iterations available for interleaving is one less.
4971     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4972                                ? (*BestKnownTC) - 1
4973                                : *BestKnownTC;
4974 
4975     // If trip count is an estimated compile time constant, limit the
4976     // IC to be capped by the trip count divided by VF * 2, such that the vector
4977     // loop runs at least twice to make interleaving seem profitable when there
4978     // is an epilogue loop present. Since exact Trip count is not known we
4979     // is an epilogue loop present. Since the exact trip count is not known, we
4980     MaxInterleaveCount = bit_floor(std::max(
4981         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4982   }
4983 
4984   assert(MaxInterleaveCount > 0 &&
4985          "Maximum interleave count must be greater than 0");
4986 
4987   // Clamp the calculated IC to be between 1 and the max interleave count
4988   // that the target and trip count allows.
4989   if (IC > MaxInterleaveCount)
4990     IC = MaxInterleaveCount;
4991   else
4992     // Make sure IC is greater than 0.
4993     IC = std::max(1u, IC);
4994 
4995   assert(IC > 0 && "Interleave count must be greater than 0.");
4996 
4997   // Interleave if we vectorized this loop and there is a reduction that could
4998   // benefit from interleaving.
4999   if (VF.isVector() && HasReductions) {
5000     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5001     return IC;
5002   }
5003 
5004   // For any scalar loop that either requires runtime checks or predication we
5005   // are better off leaving this to the unroller. Note that if we've already
5006   // vectorized the loop we will have done the runtime check and so interleaving
5007   // won't require further checks.
5008   bool ScalarInterleavingRequiresPredication =
5009       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5010          return Legal->blockNeedsPredication(BB);
5011        }));
5012   bool ScalarInterleavingRequiresRuntimePointerCheck =
5013       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5014 
5015   // We want to interleave small loops in order to reduce the loop overhead and
5016   // potentially expose ILP opportunities.
5017   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5018                     << "LV: IC is " << IC << '\n'
5019                     << "LV: VF is " << VF << '\n');
5020   const bool AggressivelyInterleaveReductions =
5021       TTI.enableAggressiveInterleaving(HasReductions);
5022   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5023       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5024     // We assume that the cost overhead is 1 and we use the cost model
5025     // to estimate the cost of the loop and interleave until the cost of the
5026     // loop overhead is about 5% of the cost of the loop.
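    // For example, with SmallLoopCost = 20 and a loop body cost of 6, the
    // computation below yields bit_floor(20 / 6) = 2 as the small-loop
    // interleave count (further capped by IC).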
5027     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5028                                         SmallLoopCost / *LoopCost.getValue()));
5029 
5030     // Interleave until store/load ports (estimated by max interleave count) are
5031     // saturated.
5032     unsigned NumStores = Legal->getNumStores();
5033     unsigned NumLoads = Legal->getNumLoads();
5034     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5035     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5036 
5037     // There is little point in interleaving for reductions containing selects
5038     // and compares when VF=1 since it may just create more overhead than it's
5039     // worth for loops with small trip counts. This is because we still have to
5040     // do the final reduction after the loop.
5041     bool HasSelectCmpReductions =
5042         HasReductions &&
5043         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5044           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5045           return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5046               RdxDesc.getRecurrenceKind());
5047         });
5048     if (HasSelectCmpReductions) {
5049       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5050       return 1;
5051     }
5052 
5053     // If we have a scalar reduction (vector reductions are already dealt with
5054     // by this point), we can increase the critical path length if the loop
5055     // we're interleaving is inside another loop. For tree-wise reductions
5056     // set the limit to 2, and for ordered reductions it's best to disable
5057     // interleaving entirely.
5058     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5059       bool HasOrderedReductions =
5060           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5061             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5062             return RdxDesc.isOrdered();
5063           });
5064       if (HasOrderedReductions) {
5065         LLVM_DEBUG(
5066             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5067         return 1;
5068       }
5069 
5070       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5071       SmallIC = std::min(SmallIC, F);
5072       StoresIC = std::min(StoresIC, F);
5073       LoadsIC = std::min(LoadsIC, F);
5074     }
5075 
5076     if (EnableLoadStoreRuntimeInterleave &&
5077         std::max(StoresIC, LoadsIC) > SmallIC) {
5078       LLVM_DEBUG(
5079           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5080       return std::max(StoresIC, LoadsIC);
5081     }
5082 
5083     // If there are scalar reductions and TTI has enabled aggressive
5084     // interleaving for reductions, we will interleave to expose ILP.
5085     if (VF.isScalar() && AggressivelyInterleaveReductions) {
5086       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5087       // Interleave no less than SmallIC but not as aggressively as the normal
5088       // IC, to handle the rare situation where resources are too limited.
5089       return std::max(IC / 2, SmallIC);
5090     } else {
5091       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5092       return SmallIC;
5093     }
5094   }
5095 
5096   // Interleave if this is a large loop (small loops are already dealt with by
5097   // this point) that could benefit from interleaving.
5098   if (AggressivelyInterleaveReductions) {
5099     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5100     return IC;
5101   }
5102 
5103   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5104   return 1;
5105 }
5106 
5107 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5108 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5109   // This function calculates the register usage by measuring the highest number
5110   // of values that are alive at a single location. Obviously, this is a very
5111   // rough estimation. We scan the loop in topological order and
5112   // assign a number to each instruction. We use RPO to ensure that defs are
5113   // met before their users. We assume that each instruction that has in-loop
5114   // users starts an interval. We record every time that an in-loop value is
5115   // used, so we have a list of the first and last occurrences of each
5116   // instruction. Next, we transpose this data structure into a multi map that
5117   // holds the list of intervals that *end* at a specific location. This multi
5118   // map allows us to perform a linear search. We scan the instructions linearly
5119   // and record each time that a new interval starts, by placing it in a set.
5120   // If we find this value in the multi-map then we remove it from the set.
5121   // The max register usage is the maximum size of the set.
5122   // We also search for instructions that are defined outside the loop, but are
5123   // used inside the loop. We need this number separately from the max-interval
5124   // usage number because when we unroll, loop-invariant values do not take
5125   // more registers.
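  // As a rough illustration: for a body "t0 = load p; t1 = add t0, x; store
  // t1, q" where x is defined outside the loop, t0 and t1 each open an
  // interval that is closed after their last in-loop use, while x is only
  // recorded in LoopInvariants and does not contribute to the per-iteration
  // maximum.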
5126   LoopBlocksDFS DFS(TheLoop);
5127   DFS.perform(LI);
5128 
5129   RegisterUsage RU;
5130 
5131   // Each 'key' in the map opens a new interval. The values
5132   // of the map are the index of the 'last seen' usage of the
5133   // instruction that is the key.
5134   using IntervalMap = DenseMap<Instruction *, unsigned>;
5135 
5136   // Maps instruction to its index.
5137   SmallVector<Instruction *, 64> IdxToInstr;
5138   // Marks the end of each interval.
5139   IntervalMap EndPoint;
5140   // Saves the list of instruction indices that are used in the loop.
5141   SmallPtrSet<Instruction *, 8> Ends;
5142   // Saves the list of values that are used in the loop but are defined outside
5143   // the loop (not including non-instruction values such as arguments and
5144   // constants).
5145   SmallSetVector<Instruction *, 8> LoopInvariants;
5146 
5147   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5148     for (Instruction &I : BB->instructionsWithoutDebug()) {
5149       IdxToInstr.push_back(&I);
5150 
5151       // Save the end location of each USE.
5152       for (Value *U : I.operands()) {
5153         auto *Instr = dyn_cast<Instruction>(U);
5154 
5155         // Ignore non-instruction values such as arguments, constants, etc.
5156         // FIXME: Might need some motivation why these values are ignored. If
5157         // FIXME: Might need some motivation for why these values are ignored.
5158         // If, for example, an argument is used inside the loop, it will increase
5159         // the register pressure (so shouldn't we add it to LoopInvariants?).
5160           continue;
5161 
5162         // If this instruction is outside the loop then record it and continue.
5163         if (!TheLoop->contains(Instr)) {
5164           LoopInvariants.insert(Instr);
5165           continue;
5166         }
5167 
5168         // Overwrite previous end points.
5169         EndPoint[Instr] = IdxToInstr.size();
5170         Ends.insert(Instr);
5171       }
5172     }
5173   }
5174 
5175   // Saves the list of intervals that end with the index in 'key'.
5176   using InstrList = SmallVector<Instruction *, 2>;
5177   DenseMap<unsigned, InstrList> TransposeEnds;
5178 
5179   // Transpose the EndPoints to a list of values that end at each index.
5180   for (auto &Interval : EndPoint)
5181     TransposeEnds[Interval.second].push_back(Interval.first);
5182 
5183   SmallPtrSet<Instruction *, 8> OpenIntervals;
5184   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5185   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5186 
5187   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5188 
5189   const auto &TTICapture = TTI;
5190   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5191     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5192       return 0;
5193     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5194   };
5195 
5196   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5197     Instruction *I = IdxToInstr[i];
5198 
5199     // Remove all of the instructions that end at this location.
5200     InstrList &List = TransposeEnds[i];
5201     for (Instruction *ToRemove : List)
5202       OpenIntervals.erase(ToRemove);
5203 
5204     // Ignore instructions that are never used within the loop.
5205     if (!Ends.count(I))
5206       continue;
5207 
5208     // Skip ignored values.
5209     if (ValuesToIgnore.count(I))
5210       continue;
5211 
5212     collectInLoopReductions();
5213 
5214     // For each VF find the maximum usage of registers.
5215     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5216       // Count the number of registers used, per register class, given all open
5217       // intervals.
5218       // Note that elements in this SmallMapVector will be default constructed
5219       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5220       // there is no previous entry for ClassID.
5221       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5222 
5223       if (VFs[j].isScalar()) {
5224         for (auto *Inst : OpenIntervals) {
5225           unsigned ClassID =
5226               TTI.getRegisterClassForType(false, Inst->getType());
5227           // FIXME: The target might use more than one register for the type
5228           // even in the scalar case.
5229           RegUsage[ClassID] += 1;
5230         }
5231       } else {
5232         collectUniformsAndScalars(VFs[j]);
5233         for (auto *Inst : OpenIntervals) {
5234           // Skip ignored values for VF > 1.
5235           if (VecValuesToIgnore.count(Inst))
5236             continue;
5237           if (isScalarAfterVectorization(Inst, VFs[j])) {
5238             unsigned ClassID =
5239                 TTI.getRegisterClassForType(false, Inst->getType());
5240             // FIXME: The target might use more than one register for the type
5241             // even in the scalar case.
5242             RegUsage[ClassID] += 1;
5243           } else {
5244             unsigned ClassID =
5245                 TTI.getRegisterClassForType(true, Inst->getType());
5246             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5247           }
5248         }
5249       }
5250 
5251       for (auto& pair : RegUsage) {
5252         auto &Entry = MaxUsages[j][pair.first];
5253         Entry = std::max(Entry, pair.second);
5254       }
5255     }
5256 
5257     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5258                       << OpenIntervals.size() << '\n');
5259 
5260     // Add the current instruction to the list of open intervals.
5261     OpenIntervals.insert(I);
5262   }
5263 
5264   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5265     // Note that elements in this SmallMapVector will be default constructed
5266     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5267     // there is no previous entry for ClassID.
5268     SmallMapVector<unsigned, unsigned, 4> Invariant;
5269 
5270     for (auto *Inst : LoopInvariants) {
5271       // FIXME: The target might use more than one register for the type
5272       // even in the scalar case.
5273       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5274         auto *I = cast<Instruction>(U);
5275         return TheLoop != LI->getLoopFor(I->getParent()) ||
5276                isScalarAfterVectorization(I, VFs[i]);
5277       });
5278 
5279       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5280       unsigned ClassID =
5281           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5282       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5283     }
5284 
5285     LLVM_DEBUG({
5286       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5287       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5288              << " item\n";
5289       for (const auto &pair : MaxUsages[i]) {
5290         dbgs() << "LV(REG): RegisterClass: "
5291                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5292                << " registers\n";
5293       }
5294       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5295              << " item\n";
5296       for (const auto &pair : Invariant) {
5297         dbgs() << "LV(REG): RegisterClass: "
5298                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5299                << " registers\n";
5300       }
5301     });
5302 
5303     RU.LoopInvariantRegs = Invariant;
5304     RU.MaxLocalUsers = MaxUsages[i];
5305     RUs[i] = RU;
5306   }
5307 
5308   return RUs;
5309 }
5310 
5311 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5312                                                            ElementCount VF) {
5313   // TODO: Cost model for emulated masked load/store is completely
5314   // broken. This hack guides the cost model to use an artificially
5315   // high enough value to practically disable vectorization with such
5316   // operations, except where previously deployed legality hack allowed
5317   // using very low cost values. This is to avoid regressions coming simply
5318   // from moving "masked load/store" check from legality to cost model.
5319   // Masked Load/Gather emulation was previously never allowed.
5320   // Limited number of Masked Store/Scatter emulation was allowed.
5321   assert((isPredicatedInst(I)) &&
5322          "Expecting a scalar emulated instruction");
5323   return isa<LoadInst>(I) ||
5324          (isa<StoreInst>(I) &&
5325           NumPredStores > NumberOfStoresToPredicate);
5326 }
5327 
5328 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5329   // If we aren't vectorizing the loop, or if we've already collected the
5330   // instructions to scalarize, there's nothing to do. Collection may already
5331   // have occurred if we have a user-selected VF and are now computing the
5332   // expected cost for interleaving.
5333   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5334     return;
5335 
5336   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5337   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5338   // map will indicate that we've analyzed it already.
5339   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5340 
5341   PredicatedBBsAfterVectorization[VF].clear();
5342 
5343   // Find all the instructions that are scalar with predication in the loop and
5344   // determine if it would be better to not if-convert the blocks they are in.
5345   // If so, we also record the instructions to scalarize.
5346   for (BasicBlock *BB : TheLoop->blocks()) {
5347     if (!blockNeedsPredicationForAnyReason(BB))
5348       continue;
5349     for (Instruction &I : *BB)
5350       if (isScalarWithPredication(&I, VF)) {
5351         ScalarCostsTy ScalarCosts;
5352         // Do not apply discount logic for:
5353         // 1. Scalars after vectorization, as there will only be a single copy
5354         // of the instruction.
5355         // 2. Scalable VF, as that would lead to invalid scalarization costs.
5356         // 3. Emulated masked memrefs, if a hacked cost is needed.
5357         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5358             !useEmulatedMaskMemRefHack(&I, VF) &&
5359             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5360           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5361         // Remember that BB will remain after vectorization.
5362         PredicatedBBsAfterVectorization[VF].insert(BB);
5363         for (auto *Pred : predecessors(BB)) {
5364           if (Pred->getSingleSuccessor() == BB)
5365             PredicatedBBsAfterVectorization[VF].insert(Pred);
5366         }
5367       }
5368   }
5369 }
5370 
5371 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5372     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5373   assert(!isUniformAfterVectorization(PredInst, VF) &&
5374          "Instruction marked uniform-after-vectorization will be predicated");
5375 
5376   // Initialize the discount to zero, meaning that the scalar version and the
5377   // vector version cost the same.
5378   InstructionCost Discount = 0;
5379 
5380   // Holds instructions to analyze. The instructions we visit are mapped in
5381   // ScalarCosts. Those instructions are the ones that would be scalarized if
5382   // we find that the scalar version costs less.
5383   SmallVector<Instruction *, 8> Worklist;
5384 
5385   // Returns true if the given instruction can be scalarized.
5386   auto canBeScalarized = [&](Instruction *I) -> bool {
5387     // We only attempt to scalarize instructions forming a single-use chain
5388     // from the original predicated block that would otherwise be vectorized.
5389     // Although not strictly necessary, we give up on instructions we know will
5390     // already be scalar to avoid traversing chains that are unlikely to be
5391     // beneficial.
5392     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5393         isScalarAfterVectorization(I, VF))
5394       return false;
5395 
5396     // If the instruction is scalar with predication, it will be analyzed
5397     // separately. We ignore it within the context of PredInst.
5398     if (isScalarWithPredication(I, VF))
5399       return false;
5400 
5401     // If any of the instruction's operands are uniform after vectorization,
5402     // the instruction cannot be scalarized. This prevents, for example, a
5403     // masked load from being scalarized.
5404     //
5405     // We assume we will only emit a value for lane zero of an instruction
5406     // marked uniform after vectorization, rather than VF identical values.
5407     // Thus, if we scalarize an instruction that uses a uniform, we would
5408     // create uses of values corresponding to the lanes we aren't emitting code
5409     // for. This behavior can be changed by allowing getScalarValue to clone
5410     // the lane zero values for uniforms rather than asserting.
5411     for (Use &U : I->operands())
5412       if (auto *J = dyn_cast<Instruction>(U.get()))
5413         if (isUniformAfterVectorization(J, VF))
5414           return false;
5415 
5416     // Otherwise, we can scalarize the instruction.
5417     return true;
5418   };
5419 
5420   // Compute the expected cost discount from scalarizing the entire expression
5421   // feeding the predicated instruction. We currently only consider expressions
5422   // that are single-use instruction chains.
5423   Worklist.push_back(PredInst);
5424   while (!Worklist.empty()) {
5425     Instruction *I = Worklist.pop_back_val();
5426 
5427     // If we've already analyzed the instruction, there's nothing to do.
5428     if (ScalarCosts.contains(I))
5429       continue;
5430 
5431     // Compute the cost of the vector instruction. Note that this cost already
5432     // includes the scalarization overhead of the predicated instruction.
5433     InstructionCost VectorCost = getInstructionCost(I, VF);
5434 
5435     // Compute the cost of the scalarized instruction. This cost is the cost of
5436     // the instruction as if it wasn't if-converted and instead remained in the
5437     // predicated block. We will scale this cost by block probability after
5438     // computing the scalarization overhead.
5439     InstructionCost ScalarCost =
5440         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5441 
5442     // Compute the scalarization overhead of needed insertelement instructions
5443     // and phi nodes.
5444     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5445     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5446       ScalarCost += TTI.getScalarizationOverhead(
5447           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5448           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5449           /*Extract*/ false, CostKind);
5450       ScalarCost +=
5451           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5452     }
5453 
5454     // Compute the scalarization overhead of needed extractelement
5455     // instructions. For each of the instruction's operands, if the operand can
5456     // be scalarized, add it to the worklist; otherwise, account for the
5457     // overhead.
5458     for (Use &U : I->operands())
5459       if (auto *J = dyn_cast<Instruction>(U.get())) {
5460         assert(VectorType::isValidElementType(J->getType()) &&
5461                "Instruction has non-scalar type");
5462         if (canBeScalarized(J))
5463           Worklist.push_back(J);
5464         else if (needsExtract(J, VF)) {
5465           ScalarCost += TTI.getScalarizationOverhead(
5466               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5467               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5468               /*Extract*/ true, CostKind);
5469         }
5470       }
5471 
5472     // Scale the total scalar cost by block probability.
5473     ScalarCost /= getReciprocalPredBlockProb();
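    // The reciprocal block probability models how often the predicated block
    // is expected to execute (by default the block is assumed to run on
    // roughly half of the iterations), so the accumulated scalar cost is
    // discounted before being compared against the vector cost below.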
5474 
5475     // Compute the discount. A non-negative discount means the vector version
5476     // of the instruction costs more, and scalarizing would be beneficial.
5477     Discount += VectorCost - ScalarCost;
5478     ScalarCosts[I] = ScalarCost;
5479   }
5480 
5481   return Discount;
5482 }
5483 
5484 InstructionCost LoopVectorizationCostModel::expectedCost(
5485     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5486   InstructionCost Cost;
5487 
5488   // For each block.
5489   for (BasicBlock *BB : TheLoop->blocks()) {
5490     InstructionCost BlockCost;
5491 
5492     // For each instruction in the old loop.
5493     for (Instruction &I : BB->instructionsWithoutDebug()) {
5494       // Skip ignored values.
5495       if (ValuesToIgnore.count(&I) ||
5496           (VF.isVector() && VecValuesToIgnore.count(&I)))
5497         continue;
5498 
5499       InstructionCost C = getInstructionCost(&I, VF);
5500 
5501       // Check if we should override the cost.
5502       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5503         C = InstructionCost(ForceTargetInstructionCost);
5504 
5505       // Keep a list of instructions with invalid costs.
5506       if (Invalid && !C.isValid())
5507         Invalid->emplace_back(&I, VF);
5508 
5509       BlockCost += C;
5510       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5511                         << VF << " For instruction: " << I << '\n');
5512     }
5513 
5514     // If we are vectorizing a predicated block, it will have been
5515     // if-converted. This means that the block's instructions (aside from
5516     // stores and instructions that may divide by zero) will now be
5517     // unconditionally executed. For the scalar case, we may not always execute
5518     // the predicated block, if it is an if-else block. Thus, scale the block's
5519     // cost by the probability of executing it. blockNeedsPredication from
5520     // Legal is used so as to not include all blocks in tail folded loops.
5521     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5522       BlockCost /= getReciprocalPredBlockProb();
5523 
5524     Cost += BlockCost;
5525   }
5526 
5527   return Cost;
5528 }
5529 
5530 /// Gets the address access SCEV after verifying that the access pattern
5531 /// is loop invariant except for the induction variable dependence.
5532 ///
5533 /// This SCEV can be sent to the Target in order to estimate the address
5534 /// calculation cost.
5535 static const SCEV *getAddressAccessSCEV(
5536               Value *Ptr,
5537               LoopVectorizationLegality *Legal,
5538               PredicatedScalarEvolution &PSE,
5539               const Loop *TheLoop) {
5540 
5541   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5542   if (!Gep)
5543     return nullptr;
5544 
5545   // We are looking for a gep with all loop invariant indices except for one
5546   // which should be an induction variable.
5547   auto SE = PSE.getSE();
5548   unsigned NumOperands = Gep->getNumOperands();
5549   for (unsigned i = 1; i < NumOperands; ++i) {
5550     Value *Opd = Gep->getOperand(i);
5551     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5552         !Legal->isInductionVariable(Opd))
5553       return nullptr;
5554   }
5555 
5556   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5557   return PSE.getSCEV(Ptr);
5558 }
5559 
5560 InstructionCost
5561 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5562                                                         ElementCount VF) {
5563   assert(VF.isVector() &&
5564          "Scalarization cost of instruction implies vectorization.");
5565   if (VF.isScalable())
5566     return InstructionCost::getInvalid();
5567 
5568   Type *ValTy = getLoadStoreType(I);
5569   auto SE = PSE.getSE();
5570 
5571   unsigned AS = getLoadStoreAddressSpace(I);
5572   Value *Ptr = getLoadStorePointerOperand(I);
5573   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5574   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5575   //       that it is being called from this specific place.
5576 
5577   // Figure out whether the access is strided and get the stride value
5578   // if it's known at compile time.
5579   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5580 
5581   // Get the cost of the scalar memory instruction and address computation.
5582   InstructionCost Cost =
5583       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5584 
5585   // Don't pass *I here, since it is scalar but will actually be part of a
5586   // vectorized loop where the user of it is a vectorized instruction.
5587   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5588   const Align Alignment = getLoadStoreAlignment(I);
5589   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5590                                                       ValTy->getScalarType(),
5591                                                       Alignment, AS, CostKind);
5592 
5593   // Get the overhead of the extractelement and insertelement instructions
5594   // we might create due to scalarization.
5595   Cost += getScalarizationOverhead(I, VF, CostKind);
5596 
5597   // If we have a predicated load/store, it will need extra i1 extracts and
5598   // conditional branches, but may not be executed for each vector lane. Scale
5599   // the cost by the probability of executing the predicated block.
5600   if (isPredicatedInst(I)) {
5601     Cost /= getReciprocalPredBlockProb();
5602 
5603     // Add the cost of an i1 extract and a branch
5604     auto *Vec_i1Ty =
5605         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5606     Cost += TTI.getScalarizationOverhead(
5607         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5608         /*Insert=*/false, /*Extract=*/true, CostKind);
5609     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5610 
5611     if (useEmulatedMaskMemRefHack(I, VF))
5612       // Artificially setting to a high enough value to practically disable
5613       // vectorization with such operations.
5614       Cost = 3000000;
5615   }
5616 
5617   return Cost;
5618 }
5619 
5620 InstructionCost
5621 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5622                                                     ElementCount VF) {
5623   Type *ValTy = getLoadStoreType(I);
5624   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5625   Value *Ptr = getLoadStorePointerOperand(I);
5626   unsigned AS = getLoadStoreAddressSpace(I);
5627   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5628   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5629 
5630   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5631          "Stride should be 1 or -1 for consecutive memory access");
5632   const Align Alignment = getLoadStoreAlignment(I);
5633   InstructionCost Cost = 0;
5634   if (Legal->isMaskRequired(I)) {
5635     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5636                                       CostKind);
5637   } else {
5638     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5639     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5640                                 CostKind, OpInfo, I);
5641   }
5642 
5643   bool Reverse = ConsecutiveStride < 0;
5644   if (Reverse)
5645     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5646                                std::nullopt, CostKind, 0);
5647   return Cost;
5648 }
5649 
5650 InstructionCost
5651 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5652                                                 ElementCount VF) {
5653   assert(Legal->isUniformMemOp(*I, VF));
5654 
5655   Type *ValTy = getLoadStoreType(I);
5656   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5657   const Align Alignment = getLoadStoreAlignment(I);
5658   unsigned AS = getLoadStoreAddressSpace(I);
5659   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5660   if (isa<LoadInst>(I)) {
5661     return TTI.getAddressComputationCost(ValTy) +
5662            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5663                                CostKind) +
5664            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5665   }
5666   StoreInst *SI = cast<StoreInst>(I);
5667 
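  // For a store to a uniform address only the value from the last vector lane
  // (which corresponds to the last scalar iteration of each vector step)
  // actually needs to be stored, so the cost below includes an extract of
  // element VF-1 unless the stored value is loop-invariant.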
5668   bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5669   return TTI.getAddressComputationCost(ValTy) +
5670          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5671                              CostKind) +
5672          (isLoopInvariantStoreValue
5673               ? 0
5674               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5675                                        CostKind, VF.getKnownMinValue() - 1));
5676 }
5677 
5678 InstructionCost
5679 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5680                                                  ElementCount VF) {
5681   Type *ValTy = getLoadStoreType(I);
5682   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5683   const Align Alignment = getLoadStoreAlignment(I);
5684   const Value *Ptr = getLoadStorePointerOperand(I);
5685 
5686   return TTI.getAddressComputationCost(VectorTy) +
5687          TTI.getGatherScatterOpCost(
5688              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5689              TargetTransformInfo::TCK_RecipThroughput, I);
5690 }
5691 
5692 InstructionCost
5693 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5694                                                    ElementCount VF) {
5695   Type *ValTy = getLoadStoreType(I);
5696   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5697   unsigned AS = getLoadStoreAddressSpace(I);
5698   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5699 
5700   auto Group = getInterleavedAccessGroup(I);
5701   assert(Group && "Fail to get an interleaved access group.");
5702 
5703   unsigned InterleaveFactor = Group->getFactor();
5704   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5705 
5706   // Holds the indices of existing members in the interleaved group.
5707   SmallVector<unsigned, 4> Indices;
5708   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5709     if (Group->getMember(IF))
5710       Indices.push_back(IF);
5711 
5712   // Calculate the cost of the whole interleaved group.
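  // A mask for gaps is needed when a store group has missing members, or when
  // the group would normally require a scalar epilogue (so the widened access
  // does not run past the end) but such an epilogue is not allowed.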
5713   bool UseMaskForGaps =
5714       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5715       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5716   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5717       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
5718       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
5719 
5720   if (Group->isReverse()) {
5721     // TODO: Add support for reversed masked interleaved access.
5722     assert(!Legal->isMaskRequired(I) &&
5723            "Reverse masked interleaved access not supported.");
5724     Cost += Group->getNumMembers() *
5725             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5726                                std::nullopt, CostKind, 0);
5727   }
5728   return Cost;
5729 }
5730 
5731 std::optional<InstructionCost>
5732 LoopVectorizationCostModel::getReductionPatternCost(
5733     Instruction *I, ElementCount VF, Type *Ty,
5734     TTI::TargetCostKind CostKind) const {
5735   using namespace llvm::PatternMatch;
5736   // Early exit for no inloop reductions
5737   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5738     return std::nullopt;
5739   auto *VectorTy = cast<VectorType>(Ty);
5740 
5741   // We are looking for one of the following patterns, finding the minimal acceptable cost:
5742   //  reduce(mul(ext(A), ext(B))) or
5743   //  reduce(mul(A, B)) or
5744   //  reduce(ext(A)) or
5745   //  reduce(A).
5746   // The basic idea is that we walk down the tree to do that, finding the root
5747   // reduction instruction in InLoopReductionImmediateChains. From there we find
5748   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5749   // of the components. If the reduction cost is lower then we return it for the
5750   // reduction instruction and 0 for the other instructions in the pattern. If
5751   // it is not, we return an invalid cost specifying that the original cost method
5752   // should be used.
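  // For example, an in-loop chain such as reduce.add(mul(sext(A), sext(B)))
  // may be costed as a single multiply-accumulate reduction on targets with
  // dot-product style support; if that combined cost beats the sum of the
  // component costs, the mul/ext instructions are reported with a cost of 0.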
5753   Instruction *RetI = I;
5754   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5755     if (!RetI->hasOneUser())
5756       return std::nullopt;
5757     RetI = RetI->user_back();
5758   }
5759 
5760   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5761       RetI->user_back()->getOpcode() == Instruction::Add) {
5762     RetI = RetI->user_back();
5763   }
5764 
5765   // Test if the found instruction is a reduction, and if not return an invalid
5766   // cost specifying the parent to use the original cost modelling.
5767   if (!InLoopReductionImmediateChains.count(RetI))
5768     return std::nullopt;
5769 
5770   // Find the reduction this chain is a part of and calculate the basic cost of
5771   // the reduction on its own.
5772   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5773   Instruction *ReductionPhi = LastChain;
5774   while (!isa<PHINode>(ReductionPhi))
5775     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5776 
5777   const RecurrenceDescriptor &RdxDesc =
5778       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5779 
5780   InstructionCost BaseCost;
5781   RecurKind RK = RdxDesc.getRecurrenceKind();
5782   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5783     Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5784     BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5785                                           RdxDesc.getFastMathFlags(), CostKind);
5786   } else {
5787     BaseCost = TTI.getArithmeticReductionCost(
5788         RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5789   }
5790 
5791   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5792   // normal fmul instruction to the cost of the fadd reduction.
5793   if (RK == RecurKind::FMulAdd)
5794     BaseCost +=
5795         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5796 
5797   // If we're using ordered reductions then we can just return the base cost
5798   // here, since getArithmeticReductionCost calculates the full ordered
5799   // reduction cost when FP reassociation is not allowed.
5800   if (useOrderedReductions(RdxDesc))
5801     return BaseCost;
5802 
5803   // Get the operand that was not the reduction chain and match it to one of the
5804   // patterns, returning the better cost if it is found.
5805   Instruction *RedOp = RetI->getOperand(1) == LastChain
5806                            ? dyn_cast<Instruction>(RetI->getOperand(0))
5807                            : dyn_cast<Instruction>(RetI->getOperand(1));
5808 
5809   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5810 
5811   Instruction *Op0, *Op1;
5812   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5813       match(RedOp,
5814             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5815       match(Op0, m_ZExtOrSExt(m_Value())) &&
5816       Op0->getOpcode() == Op1->getOpcode() &&
5817       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5818       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5819       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5820 
5821     // Matched reduce.add(ext(mul(ext(A), ext(B))))
5822     // Note that the extend opcodes need to all match, or if A==B they will have
5823     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5824     // which is equally fine.
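         // As an illustration (a sketch, not tied to any particular target),
         // a dot-product style chain such as:
         //   %a = sext i8 %x to i16
         //   %b = sext i8 %y to i16
         //   %m = mul i16 %a, %b
         //   %e = sext i16 %m to i32
         //   %r = add i32 %acc, %e    ; the in-loop add reduction
         // may be covered by a single multiply-accumulate reduction, which is
         // what TTI::getMulAccReductionCost models below.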
5825     bool IsUnsigned = isa<ZExtInst>(Op0);
5826     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5827     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5828 
5829     InstructionCost ExtCost =
5830         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5831                              TTI::CastContextHint::None, CostKind, Op0);
5832     InstructionCost MulCost =
5833         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5834     InstructionCost Ext2Cost =
5835         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5836                              TTI::CastContextHint::None, CostKind, RedOp);
5837 
5838     InstructionCost RedCost = TTI.getMulAccReductionCost(
5839         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5840 
5841     if (RedCost.isValid() &&
5842         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5843       return I == RetI ? RedCost : 0;
5844   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5845              !TheLoop->isLoopInvariant(RedOp)) {
5846     // Matched reduce(ext(A))
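         // For example (illustrative only):
         //   %e = zext i8 %a to i32
         //   %r = add i32 %acc, %e
         // which some targets can lower as a single extending reduction; this
         // is what TTI::getExtendedReductionCost models below.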
5847     bool IsUnsigned = isa<ZExtInst>(RedOp);
5848     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5849     InstructionCost RedCost = TTI.getExtendedReductionCost(
5850         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5851         RdxDesc.getFastMathFlags(), CostKind);
5852 
5853     InstructionCost ExtCost =
5854         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5855                              TTI::CastContextHint::None, CostKind, RedOp);
5856     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5857       return I == RetI ? RedCost : 0;
5858   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5859              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5860     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5861         Op0->getOpcode() == Op1->getOpcode() &&
5862         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5863       bool IsUnsigned = isa<ZExtInst>(Op0);
5864       Type *Op0Ty = Op0->getOperand(0)->getType();
5865       Type *Op1Ty = Op1->getOperand(0)->getType();
5866       Type *LargestOpTy =
5867           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5868                                                                     : Op0Ty;
5869       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5870 
5871       // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may have
5872       // different source sizes. We take the largest type as the ext to reduce,
5873       // and add the remaining extend cost, e.g. reduce(mul(ext(ext(A)), ext(B))).
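           // For instance (a sketch, widths chosen for illustration), given
           //   %a = sext i8  %x to i32
           //   %b = sext i16 %y to i32
           //   %m = mul i32 %a, %b
           // the reduction is costed as if both inputs extend from i16, plus
           // the cost of one extra i8 -> i16 extend for the narrower operand.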
5874       InstructionCost ExtCost0 = TTI.getCastInstrCost(
5875           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5876           TTI::CastContextHint::None, CostKind, Op0);
5877       InstructionCost ExtCost1 = TTI.getCastInstrCost(
5878           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5879           TTI::CastContextHint::None, CostKind, Op1);
5880       InstructionCost MulCost =
5881           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5882 
5883       InstructionCost RedCost = TTI.getMulAccReductionCost(
5884           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5885       InstructionCost ExtraExtCost = 0;
5886       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5887         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5888         ExtraExtCost = TTI.getCastInstrCost(
5889             ExtraExtOp->getOpcode(), ExtType,
5890             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5891             TTI::CastContextHint::None, CostKind, ExtraExtOp);
5892       }
5893 
5894       if (RedCost.isValid() &&
5895           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5896         return I == RetI ? RedCost : 0;
5897     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5898       // Matched reduce.add(mul())
5899       InstructionCost MulCost =
5900           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5901 
5902       InstructionCost RedCost = TTI.getMulAccReductionCost(
5903           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5904 
5905       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5906         return I == RetI ? RedCost : 0;
5907     }
5908   }
5909 
5910   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5911 }
5912 
5913 InstructionCost
5914 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5915                                                      ElementCount VF) {
5916   // Calculate scalar cost only. Vectorization cost should be ready at this
5917   // moment.
5918   if (VF.isScalar()) {
5919     Type *ValTy = getLoadStoreType(I);
5920     const Align Alignment = getLoadStoreAlignment(I);
5921     unsigned AS = getLoadStoreAddressSpace(I);
5922 
5923     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5924     return TTI.getAddressComputationCost(ValTy) +
5925            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
5926                                TTI::TCK_RecipThroughput, OpInfo, I);
5927   }
5928   return getWideningCost(I, VF);
5929 }
5930 
5931 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
5932     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
5933 
5934   // There is no mechanism yet to create a scalable scalarization loop,
5935   // so this is currently Invalid.
5936   if (VF.isScalable())
5937     return InstructionCost::getInvalid();
5938 
5939   if (VF.isScalar())
5940     return 0;
5941 
5942   InstructionCost Cost = 0;
5943   Type *RetTy = ToVectorTy(I->getType(), VF);
5944   if (!RetTy->isVoidTy() &&
5945       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5946     Cost += TTI.getScalarizationOverhead(
5947         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
5948         /*Insert*/ true,
5949         /*Extract*/ false, CostKind);
5950 
5951   // Some targets keep addresses scalar.
5952   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5953     return Cost;
5954 
5955   // Some targets support efficient element stores.
5956   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5957     return Cost;
5958 
5959   // Collect operands to consider.
5960   CallInst *CI = dyn_cast<CallInst>(I);
5961   Instruction::op_range Ops = CI ? CI->args() : I->operands();
5962 
5963   // Skip operands that do not require extraction/scalarization and do not incur
5964   // any overhead.
5965   SmallVector<Type *> Tys;
5966   for (auto *V : filterExtractingOperands(Ops, VF))
5967     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
5968   return Cost + TTI.getOperandsScalarizationOverhead(
5969                     filterExtractingOperands(Ops, VF), Tys, CostKind);
5970 }
5971 
5972 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
5973   if (VF.isScalar())
5974     return;
5975   NumPredStores = 0;
5976   for (BasicBlock *BB : TheLoop->blocks()) {
5977     // For each instruction in the old loop.
5978     for (Instruction &I : *BB) {
5979       Value *Ptr =  getLoadStorePointerOperand(&I);
5980       if (!Ptr)
5981         continue;
5982 
5983       // TODO: We should generate better code and update the cost model for
5984       // predicated uniform stores. Today they are treated as any other
5985       // predicated store (see added test cases in
5986       // invariant-store-vectorization.ll).
5987       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
5988         NumPredStores++;
5989 
5990       if (Legal->isUniformMemOp(I, VF)) {
5991         auto isLegalToScalarize = [&]() {
5992           if (!VF.isScalable())
5993             // Scalarization of fixed length vectors "just works".
5994             return true;
5995 
5996           // We have dedicated lowering for unpredicated uniform loads and
5997           // stores.  Note that even with tail folding we know that at least
5998           // one lane is active (i.e. generalized predication is not possible
5999           // here), and the logic below depends on this fact.
6000           if (!foldTailByMasking())
6001             return true;
6002 
6003           // For scalable vectors, a uniform memop load is always
6004           // uniform-by-parts and we know how to scalarize that.
6005           if (isa<LoadInst>(I))
6006             return true;
6007 
6008           // A uniform store isn't necessarily uniform-by-parts,
6009           // and we can't assume scalarization.
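               // For example (illustrative), 'p[0] = i' stores a different
               // value on every iteration even though the address is
               // invariant, so only a loop-invariant stored value lets us
               // scalarize safely here.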
6010           auto &SI = cast<StoreInst>(I);
6011           return TheLoop->isLoopInvariant(SI.getValueOperand());
6012         };
6013 
6014         const InstructionCost GatherScatterCost =
6015           isLegalGatherOrScatter(&I, VF) ?
6016           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6017 
6018         // Load: Scalar load + broadcast
6019         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6020         // FIXME: This cost is a significant under-estimate for tail folded
6021         // memory ops.
6022         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6023           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6024 
6025         // Choose the better solution for the current VF. Note that Invalid
6026         // costs compare as maximally large. If both are invalid, we get an
6027         // invalid cost, which signals a failure and a vectorization abort.
6028         if (GatherScatterCost < ScalarizationCost)
6029           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6030         else
6031           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6032         continue;
6033       }
6034 
6035       // We assume that widening is the best solution when possible.
6036       if (memoryInstructionCanBeWidened(&I, VF)) {
6037         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6038         int ConsecutiveStride = Legal->isConsecutivePtr(
6039             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6040         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6041                "Expected consecutive stride.");
6042         InstWidening Decision =
6043             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6044         setWideningDecision(&I, VF, Decision, Cost);
6045         continue;
6046       }
6047 
6048       // Choose between Interleaving, Gather/Scatter or Scalarization.
6049       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6050       unsigned NumAccesses = 1;
6051       if (isAccessInterleaved(&I)) {
6052         auto Group = getInterleavedAccessGroup(&I);
6053         assert(Group && "Fail to get an interleaved access group.");
6054 
6055         // Make one decision for the whole group.
6056         if (getWideningDecision(&I, VF) != CM_Unknown)
6057           continue;
6058 
6059         NumAccesses = Group->getNumMembers();
6060         if (interleavedAccessCanBeWidened(&I, VF))
6061           InterleaveCost = getInterleaveGroupCost(&I, VF);
6062       }
6063 
6064       InstructionCost GatherScatterCost =
6065           isLegalGatherOrScatter(&I, VF)
6066               ? getGatherScatterCost(&I, VF) * NumAccesses
6067               : InstructionCost::getInvalid();
6068 
6069       InstructionCost ScalarizationCost =
6070           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6071 
6072       // Choose the better solution for the current VF, write down this
6073       // decision, and use it during vectorization.
6074       InstructionCost Cost;
6075       InstWidening Decision;
6076       if (InterleaveCost <= GatherScatterCost &&
6077           InterleaveCost < ScalarizationCost) {
6078         Decision = CM_Interleave;
6079         Cost = InterleaveCost;
6080       } else if (GatherScatterCost < ScalarizationCost) {
6081         Decision = CM_GatherScatter;
6082         Cost = GatherScatterCost;
6083       } else {
6084         Decision = CM_Scalarize;
6085         Cost = ScalarizationCost;
6086       }
6087       // If the instruction belongs to an interleave group, the whole group
6088       // receives the same decision. The cost is computed for the whole group,
6089       // but it is actually assigned to a single member instruction.
6090       if (auto Group = getInterleavedAccessGroup(&I))
6091         setWideningDecision(Group, VF, Decision, Cost);
6092       else
6093         setWideningDecision(&I, VF, Decision, Cost);
6094     }
6095   }
6096 
6097   // Make sure that any load of address and any other address computation
6098   // remains scalar unless there is gather/scatter support. This avoids
6099   // inevitable extracts into address registers, and also has the benefit of
6100   // activating LSR more, since that pass can't optimize vectorized
6101   // addresses.
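       // E.g. (illustrative) a load whose result is only used to form another
       // load's address is re-marked CM_Scalarize below, and non-load address
       // computations are added to ForcedScalars, so no vector-to-scalar
       // extracts of addresses are needed.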
6102   if (TTI.prefersVectorizedAddressing())
6103     return;
6104 
6105   // Start with all scalar pointer uses.
6106   SmallPtrSet<Instruction *, 8> AddrDefs;
6107   for (BasicBlock *BB : TheLoop->blocks())
6108     for (Instruction &I : *BB) {
6109       Instruction *PtrDef =
6110         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6111       if (PtrDef && TheLoop->contains(PtrDef) &&
6112           getWideningDecision(&I, VF) != CM_GatherScatter)
6113         AddrDefs.insert(PtrDef);
6114     }
6115 
6116   // Add all instructions used to generate the addresses.
6117   SmallVector<Instruction *, 4> Worklist;
6118   append_range(Worklist, AddrDefs);
6119   while (!Worklist.empty()) {
6120     Instruction *I = Worklist.pop_back_val();
6121     for (auto &Op : I->operands())
6122       if (auto *InstOp = dyn_cast<Instruction>(Op))
6123         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6124             AddrDefs.insert(InstOp).second)
6125           Worklist.push_back(InstOp);
6126   }
6127 
6128   for (auto *I : AddrDefs) {
6129     if (isa<LoadInst>(I)) {
6130       // Setting the desired widening decision should ideally be handled by
6131       // the cost functions, but since this involves finding out whether the
6132       // loaded register is involved in an address computation, it is
6133       // instead changed here when we know this is the case.
6134       InstWidening Decision = getWideningDecision(I, VF);
6135       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6136         // Scalarize a widened load of address.
6137         setWideningDecision(
6138             I, VF, CM_Scalarize,
6139             (VF.getKnownMinValue() *
6140              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6141       else if (auto Group = getInterleavedAccessGroup(I)) {
6142         // Scalarize an interleave group of address loads.
6143         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6144           if (Instruction *Member = Group->getMember(I))
6145             setWideningDecision(
6146                 Member, VF, CM_Scalarize,
6147                 (VF.getKnownMinValue() *
6148                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6149         }
6150       }
6151     } else
6152       // Make sure I gets scalarized and a cost estimate without
6153       // scalarization overhead.
6154       ForcedScalars[VF].insert(I);
6155   }
6156 }
6157 
6158 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6159   assert(!VF.isScalar() &&
6160          "Trying to set a vectorization decision for a scalar VF");
6161 
6162   for (BasicBlock *BB : TheLoop->blocks()) {
6163     // For each instruction in the old loop.
6164     for (Instruction &I : *BB) {
6165       CallInst *CI = dyn_cast<CallInst>(&I);
6166 
6167       if (!CI)
6168         continue;
6169 
6170       InstructionCost ScalarCost = InstructionCost::getInvalid();
6171       InstructionCost VectorCost = InstructionCost::getInvalid();
6172       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6173       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6174 
6175       Function *ScalarFunc = CI->getCalledFunction();
6176       Type *ScalarRetTy = CI->getType();
6177       SmallVector<Type *, 4> Tys, ScalarTys;
6178       bool MaskRequired = Legal->isMaskRequired(CI);
6179       for (auto &ArgOp : CI->args())
6180         ScalarTys.push_back(ArgOp->getType());
6181 
6182       // Compute corresponding vector type for return value and arguments.
6183       Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6184       for (Type *ScalarTy : ScalarTys)
6185         Tys.push_back(ToVectorTy(ScalarTy, VF));
6186 
6187       // An in-loop reduction using an fmuladd intrinsic is a special case;
6188       // we don't want the normal cost for that intrinsic.
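           // E.g. (illustrative) 'acc = llvm.fmuladd(a[i], b[i], acc)' forming
           // an in-loop reduction is costed via getReductionPatternCost below
           // rather than as a generic intrinsic call.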
6189       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6190         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6191           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6192                                   getVectorIntrinsicIDForCall(CI, TLI),
6193                                   std::nullopt, *RedCost);
6194           continue;
6195         }
6196 
6197       // Estimate cost of scalarized vector call. The source operands are
6198       // assumed to be vectors, so we need to extract individual elements from
6199       // there, execute VF scalar calls, and then gather the result into the
6200       // vector return value.
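           // Roughly: ScalarCost = VF * cost(one scalar call) + the overhead
           // of extracting argument lanes and inserting result lanes, both
           // computed below; this is only a model, not an exact figure.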
6201       InstructionCost ScalarCallCost =
6202           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6203 
6204       // Compute costs of unpacking argument values for the scalar calls and
6205       // packing the return values to a vector.
6206       InstructionCost ScalarizationCost =
6207           getScalarizationOverhead(CI, VF, CostKind);
6208 
6209       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6210 
6211       // Find the cost of vectorizing the call, if we can find a suitable
6212       // vector variant of the function.
6213       bool UsesMask = false;
6214       VFInfo FuncInfo;
6215       Function *VecFunc = nullptr;
6216       // Search through any available variants for one we can use at this VF.
6217       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6218         // Must match requested VF.
6219         if (Info.Shape.VF != VF)
6220           continue;
6221 
6222         // Must take a mask argument if one is required
6223         if (MaskRequired && !Info.isMasked())
6224           continue;
6225 
6226         // Check that all parameter kinds are supported
6227         bool ParamsOk = true;
6228         for (VFParameter Param : Info.Shape.Parameters) {
6229           switch (Param.ParamKind) {
6230           case VFParamKind::Vector:
6231             break;
6232           case VFParamKind::OMP_Uniform: {
6233             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6234             // Make sure the scalar parameter in the loop is invariant.
6235             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6236                                               TheLoop))
6237               ParamsOk = false;
6238             break;
6239           }
6240           case VFParamKind::OMP_Linear: {
6241             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6242             // Find the stride for the scalar parameter in this loop and see if
6243             // it matches the stride for the variant.
6244             // TODO: do we need to figure out the cost of an extract to get the
6245             // first lane? Or do we hope that it will be folded away?
6246             ScalarEvolution *SE = PSE.getSE();
6247             const auto *SAR =
6248                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6249 
6250             if (!SAR || SAR->getLoop() != TheLoop) {
6251               ParamsOk = false;
6252               break;
6253             }
6254 
6255             const SCEVConstant *Step =
6256                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6257 
6258             if (!Step ||
6259                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6260               ParamsOk = false;
6261 
6262             break;
6263           }
6264           case VFParamKind::GlobalPredicate:
6265             UsesMask = true;
6266             break;
6267           default:
6268             ParamsOk = false;
6269             break;
6270           }
6271         }
6272 
6273         if (!ParamsOk)
6274           continue;
6275 
6276         // Found a suitable candidate, stop here.
6277         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6278         FuncInfo = Info;
6279         break;
6280       }
6281 
6282       // Add in the cost of synthesizing a mask if one wasn't required.
6283       InstructionCost MaskCost = 0;
6284       if (VecFunc && UsesMask && !MaskRequired)
6285         MaskCost = TTI.getShuffleCost(
6286             TargetTransformInfo::SK_Broadcast,
6287             VectorType::get(IntegerType::getInt1Ty(
6288                                 VecFunc->getFunctionType()->getContext()),
6289                             VF));
6290 
6291       if (TLI && VecFunc && !CI->isNoBuiltin())
6292         VectorCost =
6293             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6294 
6295       // Find the cost of an intrinsic; some targets may have instructions that
6296       // perform the operation without needing an actual call.
6297       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6298       if (IID != Intrinsic::not_intrinsic)
6299         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6300 
6301       InstructionCost Cost = ScalarCost;
6302       InstWidening Decision = CM_Scalarize;
6303 
6304       if (VectorCost <= Cost) {
6305         Cost = VectorCost;
6306         Decision = CM_VectorCall;
6307       }
6308 
6309       if (IntrinsicCost <= Cost) {
6310         Cost = IntrinsicCost;
6311         Decision = CM_IntrinsicCall;
6312       }
6313 
6314       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6315                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6316     }
6317   }
6318 }
6319 
6320 InstructionCost
6321 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6322                                                ElementCount VF) {
6323   // If we know that this instruction will remain uniform, check the cost of
6324   // the scalar version.
6325   if (isUniformAfterVectorization(I, VF))
6326     VF = ElementCount::getFixed(1);
6327 
6328   if (VF.isVector() && isProfitableToScalarize(I, VF))
6329     return InstsToScalarize[VF][I];
6330 
6331   // Forced scalars do not have any scalarization overhead.
6332   auto ForcedScalar = ForcedScalars.find(VF);
6333   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6334     auto InstSet = ForcedScalar->second;
6335     if (InstSet.count(I))
6336       return getInstructionCost(I, ElementCount::getFixed(1)) *
6337              VF.getKnownMinValue();
6338   }
6339 
6340   Type *RetTy = I->getType();
6341   if (canTruncateToMinimalBitwidth(I, VF))
6342     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6343   auto SE = PSE.getSE();
6344   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6345 
6346   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6347                                                 ElementCount VF) -> bool {
6348     if (VF.isScalar())
6349       return true;
6350 
6351     auto Scalarized = InstsToScalarize.find(VF);
6352     assert(Scalarized != InstsToScalarize.end() &&
6353            "VF not yet analyzed for scalarization profitability");
6354     return !Scalarized->second.count(I) &&
6355            llvm::all_of(I->users(), [&](User *U) {
6356              auto *UI = cast<Instruction>(U);
6357              return !Scalarized->second.count(UI);
6358            });
6359   };
6360   (void) hasSingleCopyAfterVectorization;
6361 
6362   Type *VectorTy;
6363   if (isScalarAfterVectorization(I, VF)) {
6364     // With the exception of GEPs and PHIs, after scalarization there should
6365     // only be one copy of the instruction generated in the loop. This is
6366     // because the VF is either 1, or any instructions that need scalarizing
6367     // have already been dealt with by the time we get here. As a result,
6368     // it means we don't have to multiply the instruction cost by VF.
6369     assert(I->getOpcode() == Instruction::GetElementPtr ||
6370            I->getOpcode() == Instruction::PHI ||
6371            (I->getOpcode() == Instruction::BitCast &&
6372             I->getType()->isPointerTy()) ||
6373            hasSingleCopyAfterVectorization(I, VF));
6374     VectorTy = RetTy;
6375   } else
6376     VectorTy = ToVectorTy(RetTy, VF);
6377 
6378   if (VF.isVector() && VectorTy->isVectorTy() &&
6379       !TTI.getNumberOfParts(VectorTy))
6380     return InstructionCost::getInvalid();
6381 
6382   // TODO: We need to estimate the cost of intrinsic calls.
6383   switch (I->getOpcode()) {
6384   case Instruction::GetElementPtr:
6385     // We mark this instruction as zero-cost because the cost of GEPs in
6386     // vectorized code depends on whether the corresponding memory instruction
6387     // is scalarized or not. Therefore, we handle GEPs with the memory
6388     // instruction cost.
6389     return 0;
6390   case Instruction::Br: {
6391     // In cases of scalarized and predicated instructions, there will be VF
6392     // predicated blocks in the vectorized loop. Each branch around these
6393     // blocks also requires an extract of its vector compare i1 element.
6394     // Note that the conditional branch from the loop latch will be replaced by
6395     // a single branch controlling the loop, so there is no extra overhead from
6396     // scalarization.
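         // E.g. for a fixed VF of 4, a branch around a scalarized predicated
         // block is modelled below as 4 extracts of the i1 condition plus 4
         // branches.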
6397     bool ScalarPredicatedBB = false;
6398     BranchInst *BI = cast<BranchInst>(I);
6399     if (VF.isVector() && BI->isConditional() &&
6400         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6401          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6402         BI->getParent() != TheLoop->getLoopLatch())
6403       ScalarPredicatedBB = true;
6404 
6405     if (ScalarPredicatedBB) {
6406       // Not possible to scalarize a scalable vector with predicated instructions.
6407       if (VF.isScalable())
6408         return InstructionCost::getInvalid();
6409       // Return cost for branches around scalarized and predicated blocks.
6410       auto *Vec_i1Ty =
6411           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6412       return (
6413           TTI.getScalarizationOverhead(
6414               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6415               /*Insert*/ false, /*Extract*/ true, CostKind) +
6416           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6417     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6418       // The back-edge branch will remain, as will all scalar branches.
6419       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6420     else
6421       // This branch will be eliminated by if-conversion.
6422       return 0;
6423     // Note: We currently assume zero cost for an unconditional branch inside
6424     // a predicated block since it will become a fall-through, although we
6425     // may decide in the future to call TTI for all branches.
6426   }
6427   case Instruction::PHI: {
6428     auto *Phi = cast<PHINode>(I);
6429 
6430     // First-order recurrences are replaced by vector shuffles inside the loop.
6431     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6432       // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6433       // penultimate value of the recurrence.
6434       // TODO: Consider vscale_range info.
6435       if (VF.isScalable() && VF.getKnownMinValue() == 1)
6436         return InstructionCost::getInvalid();
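           // The recurrence becomes a splice of the previous and current
           // vector values; e.g. for VF = 4 the shuffle mask built below is
           // <3, 4, 5, 6>.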
6437       SmallVector<int> Mask(VF.getKnownMinValue());
6438       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6439       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6440                                 cast<VectorType>(VectorTy), Mask, CostKind,
6441                                 VF.getKnownMinValue() - 1);
6442     }
6443 
6444     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6445     // converted into select instructions. We require N - 1 selects per phi
6446     // node, where N is the number of incoming values.
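         // E.g. a phi merging the two arms of an if/else is costed as the
         //   select <VF x i1> %cond, <VF x ty> %then, <VF x ty> %else
         // that if-conversion will produce (one select per extra incoming
         // value).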
6447     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6448       return (Phi->getNumIncomingValues() - 1) *
6449              TTI.getCmpSelInstrCost(
6450                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6451                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6452                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6453 
6454     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6455   }
6456   case Instruction::UDiv:
6457   case Instruction::SDiv:
6458   case Instruction::URem:
6459   case Instruction::SRem:
6460     if (VF.isVector() && isPredicatedInst(I)) {
6461       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6462       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6463         ScalarCost : SafeDivisorCost;
6464     }
6465     // We've proven all lanes safe to speculate, fall through.
6466     [[fallthrough]];
6467   case Instruction::Add:
6468   case Instruction::FAdd:
6469   case Instruction::Sub:
6470   case Instruction::FSub:
6471   case Instruction::Mul:
6472   case Instruction::FMul:
6473   case Instruction::FDiv:
6474   case Instruction::FRem:
6475   case Instruction::Shl:
6476   case Instruction::LShr:
6477   case Instruction::AShr:
6478   case Instruction::And:
6479   case Instruction::Or:
6480   case Instruction::Xor: {
6481     // If we're speculating on the stride being 1, the multiplication may
6482     // fold away.  We can generalize this for all operations using the notion
6483     // of neutral elements.  (TODO)
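         // E.g. an index computation 'i * %stride' where predicated SCEV
         // analysis has assumed %stride == 1; the multiply folds away, so it
         // is given zero cost here.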
6484     if (I->getOpcode() == Instruction::Mul &&
6485         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6486          PSE.getSCEV(I->getOperand(1))->isOne()))
6487       return 0;
6488 
6489     // Detect reduction patterns
6490     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6491       return *RedCost;
6492 
6493     // Certain instructions can be cheaper to vectorize if they have a constant
6494     // second vector operand. One example of this are shifts on x86.
6495     Value *Op2 = I->getOperand(1);
6496     auto Op2Info = TTI.getOperandInfo(Op2);
6497     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6498         Legal->isInvariant(Op2))
6499       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6500 
6501     SmallVector<const Value *, 4> Operands(I->operand_values());
6502     return TTI.getArithmeticInstrCost(
6503         I->getOpcode(), VectorTy, CostKind,
6504         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6505         Op2Info, Operands, I, TLI);
6506   }
6507   case Instruction::FNeg: {
6508     return TTI.getArithmeticInstrCost(
6509         I->getOpcode(), VectorTy, CostKind,
6510         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6511         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6512         I->getOperand(0), I);
6513   }
6514   case Instruction::Select: {
6515     SelectInst *SI = cast<SelectInst>(I);
6516     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6517     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6518 
6519     const Value *Op0, *Op1;
6520     using namespace llvm::PatternMatch;
6521     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6522                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6523       // select x, y, false --> x & y
6524       // select x, true, y --> x | y
6525       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6526       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6527       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6528               Op1->getType()->getScalarSizeInBits() == 1);
6529 
6530       SmallVector<const Value *, 2> Operands{Op0, Op1};
6531       return TTI.getArithmeticInstrCost(
6532           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6533           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6534     }
6535 
6536     Type *CondTy = SI->getCondition()->getType();
6537     if (!ScalarCond)
6538       CondTy = VectorType::get(CondTy, VF);
6539 
6540     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6541     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6542       Pred = Cmp->getPredicate();
6543     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6544                                   CostKind, I);
6545   }
6546   case Instruction::ICmp:
6547   case Instruction::FCmp: {
6548     Type *ValTy = I->getOperand(0)->getType();
6549     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6550     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6551       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6552     VectorTy = ToVectorTy(ValTy, VF);
6553     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6554                                   cast<CmpInst>(I)->getPredicate(), CostKind,
6555                                   I);
6556   }
6557   case Instruction::Store:
6558   case Instruction::Load: {
6559     ElementCount Width = VF;
6560     if (Width.isVector()) {
6561       InstWidening Decision = getWideningDecision(I, Width);
6562       assert(Decision != CM_Unknown &&
6563              "CM decision should be taken at this point");
6564       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6565         return InstructionCost::getInvalid();
6566       if (Decision == CM_Scalarize)
6567         Width = ElementCount::getFixed(1);
6568     }
6569     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
6570     return getMemoryInstructionCost(I, VF);
6571   }
6572   case Instruction::BitCast:
6573     if (I->getType()->isPointerTy())
6574       return 0;
6575     [[fallthrough]];
6576   case Instruction::ZExt:
6577   case Instruction::SExt:
6578   case Instruction::FPToUI:
6579   case Instruction::FPToSI:
6580   case Instruction::FPExt:
6581   case Instruction::PtrToInt:
6582   case Instruction::IntToPtr:
6583   case Instruction::SIToFP:
6584   case Instruction::UIToFP:
6585   case Instruction::Trunc:
6586   case Instruction::FPTrunc: {
6587     // Computes the CastContextHint from a Load/Store instruction.
6588     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6589       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6590              "Expected a load or a store!");
6591 
6592       if (VF.isScalar() || !TheLoop->contains(I))
6593         return TTI::CastContextHint::Normal;
6594 
6595       switch (getWideningDecision(I, VF)) {
6596       case LoopVectorizationCostModel::CM_GatherScatter:
6597         return TTI::CastContextHint::GatherScatter;
6598       case LoopVectorizationCostModel::CM_Interleave:
6599         return TTI::CastContextHint::Interleave;
6600       case LoopVectorizationCostModel::CM_Scalarize:
6601       case LoopVectorizationCostModel::CM_Widen:
6602         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6603                                         : TTI::CastContextHint::Normal;
6604       case LoopVectorizationCostModel::CM_Widen_Reverse:
6605         return TTI::CastContextHint::Reversed;
6606       case LoopVectorizationCostModel::CM_Unknown:
6607         llvm_unreachable("Instr did not go through cost modelling?");
6608       case LoopVectorizationCostModel::CM_VectorCall:
6609       case LoopVectorizationCostModel::CM_IntrinsicCall:
6610         llvm_unreachable_internal("Instr has invalid widening decision");
6611       }
6612 
6613       llvm_unreachable("Unhandled case!");
6614     };
6615 
6616     unsigned Opcode = I->getOpcode();
6617     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6618     // For Trunc, the context is the only user, which must be a StoreInst.
6619     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6620       if (I->hasOneUse())
6621         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6622           CCH = ComputeCCH(Store);
6623     }
6624     // For Z/Sext, the context is the operand, which must be a LoadInst.
6625     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6626              Opcode == Instruction::FPExt) {
6627       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6628         CCH = ComputeCCH(Load);
6629     }
6630 
6631     // We optimize the truncation of induction variables having constant
6632     // integer steps. The cost of these truncations is the same as the scalar
6633     // operation.
6634     if (isOptimizableIVTruncate(I, VF)) {
6635       auto *Trunc = cast<TruncInst>(I);
6636       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6637                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6638     }
6639 
6640     // Detect reduction patterns
6641     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6642       return *RedCost;
6643 
6644     Type *SrcScalarTy = I->getOperand(0)->getType();
6645     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6646     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6647       SrcScalarTy =
6648           IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6649     Type *SrcVecTy =
6650         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6651 
6652     if (canTruncateToMinimalBitwidth(I, VF)) {
6653       // If the result type is <= the source type, there will be no extend
6654       // after truncating the users to the minimal required bitwidth.
6655       if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6656           (I->getOpcode() == Instruction::ZExt ||
6657            I->getOpcode() == Instruction::SExt))
6658         return 0;
6659     }
6660 
6661     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6662   }
6663   case Instruction::Call:
6664     return getVectorCallCost(cast<CallInst>(I), VF);
6665   case Instruction::ExtractValue:
6666     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
6667   case Instruction::Alloca:
6668     // We cannot easily widen alloca to a scalable alloca, as
6669     // the result would need to be a vector of pointers.
6670     if (VF.isScalable())
6671       return InstructionCost::getInvalid();
6672     [[fallthrough]];
6673   default:
6674     // This opcode is unknown. Assume that it is the same as 'mul'.
6675     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6676   } // end of switch.
6677 }
6678 
6679 void LoopVectorizationCostModel::collectValuesToIgnore() {
6680   // Ignore ephemeral values.
6681   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6682 
6683   SmallVector<Value *, 4> DeadInterleavePointerOps;
6684   for (BasicBlock *BB : TheLoop->blocks())
6685     for (Instruction &I : *BB) {
6686       // Find all stores to invariant variables. Since they are going to be
6687       // sunk outside the loop, we do not need to calculate a cost for them.
6688       StoreInst *SI;
6689       if ((SI = dyn_cast<StoreInst>(&I)) &&
6690           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
6691         ValuesToIgnore.insert(&I);
6692 
6693       // For interleave groups, we only create a pointer for the start of the
6694       // interleave group. Queue up addresses of group members except the insert
6695       // position for further processing.
6696       if (isAccessInterleaved(&I)) {
6697         auto *Group = getInterleavedAccessGroup(&I);
6698         if (Group->getInsertPos() == &I)
6699           continue;
6700         Value *PointerOp = getLoadStorePointerOperand(&I);
6701         DeadInterleavePointerOps.push_back(PointerOp);
6702       }
6703     }
6704 
6705   // Mark ops feeding interleave group members as free, if they are only used
6706   // by other dead computations.
6707   for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6708     auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6709     if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6710           Instruction *UI = cast<Instruction>(U);
6711           return !VecValuesToIgnore.contains(U) &&
6712                  (!isAccessInterleaved(UI) ||
6713                   getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6714         }))
6715       continue;
6716     VecValuesToIgnore.insert(Op);
6717     DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6718   }
6719 
6720   // Ignore type-promoting instructions we identified during reduction
6721   // detection.
6722   for (const auto &Reduction : Legal->getReductionVars()) {
6723     const RecurrenceDescriptor &RedDes = Reduction.second;
6724     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6725     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6726   }
6727   // Ignore type-casting instructions we identified during induction
6728   // detection.
6729   for (const auto &Induction : Legal->getInductionVars()) {
6730     const InductionDescriptor &IndDes = Induction.second;
6731     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6732     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6733   }
6734 }
6735 
6736 void LoopVectorizationCostModel::collectInLoopReductions() {
6737   for (const auto &Reduction : Legal->getReductionVars()) {
6738     PHINode *Phi = Reduction.first;
6739     const RecurrenceDescriptor &RdxDesc = Reduction.second;
6740 
6741     // We don't collect reductions that are type promoted (yet).
6742     if (RdxDesc.getRecurrenceType() != Phi->getType())
6743       continue;
6744 
6745     // If the target would prefer this reduction to happen "in-loop", then we
6746     // want to record it as such.
6747     unsigned Opcode = RdxDesc.getOpcode();
6748     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6749         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6750                                    TargetTransformInfo::ReductionFlags()))
6751       continue;
6752 
6753     // Check that we can correctly put the reductions into the loop, by
6754     // finding the chain of operations that leads from the phi to the loop
6755     // exit value.
6756     SmallVector<Instruction *, 4> ReductionOperations =
6757         RdxDesc.getReductionOpChain(Phi, TheLoop);
6758     bool InLoop = !ReductionOperations.empty();
6759 
6760     if (InLoop) {
6761       InLoopReductions.insert(Phi);
6762       // Add the elements to InLoopReductionImmediateChains for cost modelling.
6763       Instruction *LastChain = Phi;
6764       for (auto *I : ReductionOperations) {
6765         InLoopReductionImmediateChains[I] = LastChain;
6766         LastChain = I;
6767       }
6768     }
6769     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6770                       << " reduction for phi: " << *Phi << "\n");
6771   }
6772 }
6773 
6774 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
6775                                DebugLoc DL, const Twine &Name) {
6776   assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
6777          Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
6778   return tryInsertInstruction(
6779       new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
6780 }
6781 
6782 // This function will select a scalable VF if the target supports scalable
6783 // vectors and a fixed one otherwise.
6784 // TODO: we could return a pair of values that specify the max VF and
6785 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6786 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6787 // doesn't have a cost model that can choose which plan to execute if
6788 // more than one is generated.
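     // For example (hypothetical target): with 512-bit fixed-width vector
     // registers and i32 as the widest element type this returns a fixed VF of
     // 16; a target advertising scalable vectors returns a scalable count
     // instead.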
6789 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6790                                      LoopVectorizationCostModel &CM) {
6791   unsigned WidestType;
6792   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6793 
6794   TargetTransformInfo::RegisterKind RegKind =
6795       TTI.enableScalableVectorization()
6796           ? TargetTransformInfo::RGK_ScalableVector
6797           : TargetTransformInfo::RGK_FixedWidthVector;
6798 
6799   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
6800   unsigned N = RegSize.getKnownMinValue() / WidestType;
6801   return ElementCount::get(N, RegSize.isScalable());
6802 }
6803 
6804 VectorizationFactor
6805 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6806   ElementCount VF = UserVF;
6807   // Outer loop handling: They may require CFG and instruction level
6808   // transformations before even evaluating whether vectorization is profitable.
6809   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6810   // the vectorization pipeline.
6811   if (!OrigLoop->isInnermost()) {
6812     // If the user doesn't provide a vectorization factor, determine a
6813     // reasonable one.
6814     if (UserVF.isZero()) {
6815       VF = determineVPlanVF(TTI, CM);
6816       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6817 
6818       // Make sure we have a VF > 1 for stress testing.
6819       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6820         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6821                           << "overriding computed VF.\n");
6822         VF = ElementCount::getFixed(4);
6823       }
6824     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6825                !ForceTargetSupportsScalableVectors) {
6826       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6827                         << "not supported by the target.\n");
6828       reportVectorizationFailure(
6829           "Scalable vectorization requested but not supported by the target",
6830           "the scalable user-specified vectorization width for outer-loop "
6831           "vectorization cannot be used because the target does not support "
6832           "scalable vectors.",
6833           "ScalableVFUnfeasible", ORE, OrigLoop);
6834       return VectorizationFactor::Disabled();
6835     }
6836     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6837     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6838            "VF needs to be a power of two");
6839     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6840                       << "VF " << VF << " to build VPlans.\n");
6841     buildVPlans(VF, VF);
6842 
6843     // For VPlan build stress testing, we bail out after VPlan construction.
6844     if (VPlanBuildStressTest)
6845       return VectorizationFactor::Disabled();
6846 
6847     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6848   }
6849 
6850   LLVM_DEBUG(
6851       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6852                 "VPlan-native path.\n");
6853   return VectorizationFactor::Disabled();
6854 }
6855 
6856 std::optional<VectorizationFactor>
6857 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6858   assert(OrigLoop->isInnermost() && "Inner loop expected.");
6859   CM.collectValuesToIgnore();
6860   CM.collectElementTypesForWidening();
6861 
6862   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6863   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
6864     return std::nullopt;
6865 
6866   // Invalidate interleave groups if all blocks of the loop will be predicated.
6867   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
6868       !useMaskedInterleavedAccesses(TTI)) {
6869     LLVM_DEBUG(
6870         dbgs()
6871         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6872            "which requires masked-interleaved support.\n");
6873     if (CM.InterleaveInfo.invalidateGroups())
6874       // Invalidating interleave groups also requires invalidating all decisions
6875       // based on them, which includes widening decisions and uniform and scalar
6876       // values.
6877       CM.invalidateCostModelingDecisions();
6878   }
6879 
6880   if (CM.foldTailByMasking())
6881     Legal->prepareToFoldTailByMasking();
6882 
6883   ElementCount MaxUserVF =
6884       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6885   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
6886   if (!UserVF.isZero() && UserVFIsLegal) {
6887     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6888            "VF needs to be a power of two");
6889     // Collect the instructions (and their associated costs) that will be more
6890     // profitable to scalarize.
6891     CM.collectInLoopReductions();
6892     if (CM.selectUserVectorizationFactor(UserVF)) {
6893       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6894       buildVPlansWithVPRecipes(UserVF, UserVF);
6895       if (!hasPlanWithVF(UserVF)) {
6896         LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
6897                           << ".\n");
6898         return std::nullopt;
6899       }
6900 
6901       LLVM_DEBUG(printPlans(dbgs()));
6902       return {{UserVF, 0, 0}};
6903     } else
6904       reportVectorizationInfo("UserVF ignored because of invalid costs.",
6905                               "InvalidCost", ORE, OrigLoop);
6906   }
6907 
6908   // Collect the Vectorization Factor Candidates.
6909   SmallVector<ElementCount> VFCandidates;
6910   for (auto VF = ElementCount::getFixed(1);
6911        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
6912     VFCandidates.push_back(VF);
6913   for (auto VF = ElementCount::getScalable(1);
6914        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
6915     VFCandidates.push_back(VF);
6916 
6917   CM.collectInLoopReductions();
6918   for (const auto &VF : VFCandidates) {
6919     // Collect Uniform and Scalar instructions after vectorization with VF.
6920     CM.collectUniformsAndScalars(VF);
6921 
6922     // Collect the instructions (and their associated costs) that will be more
6923     // profitable to scalarize.
6924     if (VF.isVector())
6925       CM.collectInstsToScalarize(VF);
6926   }
6927 
6928   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
6929   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
6930 
6931   LLVM_DEBUG(printPlans(dbgs()));
6932   if (VPlans.empty())
6933     return std::nullopt;
6934   if (all_of(VPlans,
6935              [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
6936     return VectorizationFactor::Disabled();
6937 
6938   // Select the optimal vectorization factor according to the legacy cost-model.
6939   // This is now only used to verify the decisions by the new VPlan-based
6940   // cost-model and will be retired once the VPlan-based cost-model is
6941   // stabilized.
6942   VectorizationFactor VF = selectVectorizationFactor();
6943   assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
6944   if (!hasPlanWithVF(VF.Width)) {
6945     LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
6946                       << ".\n");
6947     return std::nullopt;
6948   }
6949   return VF;
6950 }
6951 
6952 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6953                                              ElementCount VF) const {
6954   return CM.getInstructionCost(UI, VF);
6955 }
6956 
6957 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6958   return CM.ValuesToIgnore.contains(UI) ||
6959          (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
6960          SkipCostComputation.contains(UI);
6961 }
6962 
6963 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
6964                                                ElementCount VF) const {
6965   InstructionCost Cost = 0;
6966   LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
6967   VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
6968 
6969   // Cost modeling for inductions is inaccurate in the legacy cost model
6970   // compared to the recipes that are generated. To match here initially during
6971   // VPlan cost model bring up directly use the induction costs from the legacy
6972   // cost model. Note that we do this as pre-processing; the VPlan may not have
6973   // any recipes associated with the original induction increment instruction
6974   // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6975   // the cost of induction phis and increments (both that are represented by
6976   // recipes and those that are not), to avoid distinguishing between them here,
6977   // and skip all recipes that represent induction phis and increments (the
6978   // former case) later on, if they exist, to avoid counting them twice.
6979   // Similarly we pre-compute the cost of any optimized truncates.
6980   // TODO: Switch to more accurate costing based on VPlan.
6981   for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6982     Instruction *IVInc = cast<Instruction>(
6983         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
6984     SmallVector<Instruction *> IVInsts = {IV, IVInc};
6985     for (User *U : IV->users()) {
6986       auto *CI = cast<Instruction>(U);
6987       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
6988         continue;
6989       IVInsts.push_back(CI);
6990     }
6991     for (Instruction *IVInst : IVInsts) {
6992       if (!CostCtx.SkipCostComputation.insert(IVInst).second)
6993         continue;
6994       InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
6995       LLVM_DEBUG({
6996         dbgs() << "Cost of " << InductionCost << " for VF " << VF
6997                << ": induction instruction " << *IVInst << "\n";
6998       });
6999       Cost += InductionCost;
7000     }
7001   }
7002 
7003   // Compute the cost of all exiting conditions of the loop using the legacy
7004   // cost model. This is to match the legacy behavior, which adds the cost of
7005   // all exit conditions. Note that this over-estimates the cost, as there will
7006   // be a single condition to control the vector loop.
7007   SmallVector<BasicBlock *> Exiting;
7008   CM.TheLoop->getExitingBlocks(Exiting);
7009   SetVector<Instruction *> ExitInstrs;
7010   // Collect all exit conditions.
7011   for (BasicBlock *EB : Exiting) {
7012     auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7013     if (!Term)
7014       continue;
7015     if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7016       ExitInstrs.insert(CondI);
7017     }
7018   }
7019   // Compute the cost of all instructions only feeding the exit conditions.
7020   for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7021     Instruction *CondI = ExitInstrs[I];
7022     if (!OrigLoop->contains(CondI) ||
7023         !CostCtx.SkipCostComputation.insert(CondI).second)
7024       continue;
7025     Cost += CostCtx.getLegacyCost(CondI, VF);
7026     for (Value *Op : CondI->operands()) {
7027       auto *OpI = dyn_cast<Instruction>(Op);
7028       if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7029             return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7030                    !ExitInstrs.contains(cast<Instruction>(U));
7031           }))
7032         continue;
7033       ExitInstrs.insert(OpI);
7034     }
7035   }
7036 
7037   // The legacy cost model has special logic to compute the cost of in-loop
7038   // reductions, which may be smaller than the sum of all instructions involved
7039   // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7040   // which the legacy cost model uses to assign cost. Pre-compute their costs
7041   // for now.
7042   // TODO: Switch to costing based on VPlan once the logic has been ported.
7043   for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7044     if (!CM.isInLoopReduction(RedPhi) &&
7045         !RecurrenceDescriptor::isAnyOfRecurrenceKind(
7046             RdxDesc.getRecurrenceKind()))
7047       continue;
7048 
7049     // AnyOf reduction codegen may remove the select. To match the legacy cost
7050     // model, pre-compute the cost for AnyOf reductions here.
7051     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7052             RdxDesc.getRecurrenceKind())) {
7053       auto *Select = cast<SelectInst>(*find_if(
7054           RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
7055       assert(!CostCtx.SkipCostComputation.contains(Select) &&
7056              "reduction op visited multiple times");
7057       CostCtx.SkipCostComputation.insert(Select);
7058       auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
7059       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7060                         << ":\n any-of reduction " << *Select << "\n");
7061       Cost += ReductionCost;
7062       continue;
7063     }
7064 
7065     const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7066     SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7067                                                  ChainOps.end());
7068     // Also include the operands of instructions in the chain, as the cost-model
7069     // may mark extends as free.
7070     for (auto *ChainOp : ChainOps) {
7071       for (Value *Op : ChainOp->operands()) {
7072         if (auto *I = dyn_cast<Instruction>(Op))
7073           ChainOpsAndOperands.insert(I);
7074       }
7075     }
7076 
7077     // Pre-compute the cost for I, if it has a reduction pattern cost.
7078     for (Instruction *I : ChainOpsAndOperands) {
7079       auto ReductionCost = CM.getReductionPatternCost(
7080           I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7081       if (!ReductionCost)
7082         continue;
7083 
7084       assert(!CostCtx.SkipCostComputation.contains(I) &&
7085              "reduction op visited multiple times");
7086       CostCtx.SkipCostComputation.insert(I);
7087       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7088                         << ":\n in-loop reduction " << *I << "\n");
7089       Cost += *ReductionCost;
7090     }
7091   }
7092 
7093   // Pre-compute the costs for branches except for the backedge, as the number
7094   // of replicate regions in a VPlan may not directly match the number of
7095   // branches, which would lead to different decisions.
7096   // TODO: Compute cost of branches for each replicate region in the VPlan,
7097   // which is more accurate than the legacy cost model.
7098   for (BasicBlock *BB : OrigLoop->blocks()) {
7099     if (BB == OrigLoop->getLoopLatch())
7100       continue;
7101     CostCtx.SkipCostComputation.insert(BB->getTerminator());
7102     auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7103     Cost += BranchCost;
7104   }
7105   // Now compute and add the VPlan-based cost.
7106   Cost += Plan.cost(VF, CostCtx);
7107   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7108   return Cost;
7109 }
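// Illustrative sketch of the hybrid costing above (assumed example loop, not
// taken from this file): for
//   for (i = 0; i < n; ++i) a[i] = b[i] + 42;
// the induction phi/increment and the exit compare are pre-costed with the
// legacy model and recorded in CostCtx.SkipCostComputation, while the widened
// load, add, store and the loop backedge are costed by Plan.cost(VF, CostCtx).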
7110 
7111 VPlan &LoopVectorizationPlanner::getBestPlan() const {
7112   // If there is a single VPlan with a single VF, return it directly.
7113   VPlan &FirstPlan = *VPlans[0];
7114   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7115     return FirstPlan;
7116 
7117   VPlan *BestPlan = &FirstPlan;
7118   ElementCount ScalarVF = ElementCount::getFixed(1);
7119   assert(hasPlanWithVF(ScalarVF) &&
7120          "More than a single plan/VF w/o any plan having scalar VF");
7121 
7122   // TODO: Compute scalar cost using VPlan-based cost model.
7123   InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7124   VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
7125 
7126   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7127   if (ForceVectorization) {
7128     // Ignore scalar width, because the user explicitly wants vectorization.
7129     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7130     // evaluation.
7131     BestFactor.Cost = InstructionCost::getMax();
7132   }
7133 
7134   for (auto &P : VPlans) {
7135     for (ElementCount VF : P->vectorFactors()) {
7136       if (VF.isScalar())
7137         continue;
7138       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7139         LLVM_DEBUG(
7140             dbgs()
7141             << "LV: Not considering vector loop of width " << VF
7142             << " because it will not generate any vector instructions.\n");
7143         continue;
7144       }
7145 
7146       InstructionCost Cost = cost(*P, VF);
7147       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7148       if (isMoreProfitable(CurrentFactor, BestFactor)) {
7149         BestFactor = CurrentFactor;
7150         BestPlan = &*P;
7151       }
7152     }
7153   }
7154   BestPlan->setVF(BestFactor.Width);
7155   return *BestPlan;
7156 }
7157 
7158 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7159   assert(count_if(VPlans,
7160                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7161              1 &&
7162          "Best VF has not a single VPlan.");
7163 
7164   for (const VPlanPtr &Plan : VPlans) {
7165     if (Plan->hasVF(VF))
7166       return *Plan.get();
7167   }
7168   llvm_unreachable("No plan found!");
7169 }
7170 
7171 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7172   SmallVector<Metadata *, 4> MDs;
7173   // Reserve first location for self reference to the LoopID metadata node.
7174   MDs.push_back(nullptr);
7175   bool IsUnrollMetadata = false;
7176   MDNode *LoopID = L->getLoopID();
7177   if (LoopID) {
7178     // First find existing loop unrolling disable metadata.
7179     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7180       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7181       if (MD) {
7182         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7183         IsUnrollMetadata =
7184             S && S->getString().starts_with("llvm.loop.unroll.disable");
7185       }
7186       MDs.push_back(LoopID->getOperand(i));
7187     }
7188   }
7189 
7190   if (!IsUnrollMetadata) {
7191     // Add runtime unroll disable metadata.
7192     LLVMContext &Context = L->getHeader()->getContext();
7193     SmallVector<Metadata *, 1> DisableOperands;
7194     DisableOperands.push_back(
7195         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7196     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7197     MDs.push_back(DisableNode);
7198     MDNode *NewLoopID = MDNode::get(Context, MDs);
7199     // Set operand 0 to refer to the loop id itself.
7200     NewLoopID->replaceOperandWith(0, NewLoopID);
7201     L->setLoopID(NewLoopID);
7202   }
7203 }
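// Rough shape of the result (illustrative metadata numbering, assumed): for a
// loop with no existing unroll metadata, the updated loop ID looks roughly like
//   !0 = distinct !{!0, <existing operands...>, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// i.e. a self-referential first operand followed by the disable node.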
7204 
7205 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is,
7206 // create a merge phi node for it and add it to \p ReductionResumeValues.
7207 static void createAndCollectMergePhiForReduction(
7208     VPInstruction *RedResult,
7209     DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7210     VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
7211     bool VectorizingEpilogue) {
7212   if (!RedResult ||
7213       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7214     return;
7215 
7216   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7217   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7218 
7219   Value *FinalValue =
7220       State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7221   auto *ResumePhi =
7222       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7223   if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
7224                                  RdxDesc.getRecurrenceKind())) {
7225     auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
7226     assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
7227     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
7228     ResumePhi = cast<PHINode>(Cmp->getOperand(0));
7229   }
7230   assert((!VectorizingEpilogue || ResumePhi) &&
7231          "when vectorizing the epilogue loop, we need a resume phi from main "
7232          "vector loop");
7233 
7234   // TODO: bc.merge.rdx should not be created here, instead it should be
7235   // modeled in VPlan.
7236   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7237   // Create a phi node that merges control-flow from the backedge-taken check
7238   // block and the middle block.
7239   auto *BCBlockPhi =
7240       PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7241                       LoopScalarPreHeader->getTerminator()->getIterator());
7242 
7243   // If we are fixing reductions in the epilogue loop then we should already
7244   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7245   // we carry over the incoming values correctly.
7246   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7247     if (Incoming == LoopMiddleBlock)
7248       BCBlockPhi->addIncoming(FinalValue, Incoming);
7249     else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7250       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7251                               Incoming);
7252     else
7253       BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
7254   }
7255 
7256   auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7257   // TODO: This fixup should instead be modeled in VPlan.
7258   // Fix the scalar loop reduction variable with the incoming reduction sum
7259   // from the vector body and from the backedge value.
7260   int IncomingEdgeBlockIdx =
7261       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7262   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7263   // Pick the other block.
7264   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7265   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7266   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7267   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7268 
7269   ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7270 }
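// Sketch of the merge phi created above (block and value names are
// illustrative only):
//   %bc.merge.rdx = phi <ty> [ %rdx.result, %middle.block ],
//                            [ %rdx.start, <other preheader predecessor> ]
// where %rdx.result is the reduction result from the middle block and
// %rdx.start is the recurrence start value (or the resume phi value from the
// main vector loop when vectorizing the epilogue).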
7271 
7272 std::pair<DenseMap<const SCEV *, Value *>,
7273           DenseMap<const RecurrenceDescriptor *, Value *>>
7274 LoopVectorizationPlanner::executePlan(
7275     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7276     InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7277     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7278   assert(BestVPlan.hasVF(BestVF) &&
7279          "Trying to execute plan with unsupported VF");
7280   assert(BestVPlan.hasUF(BestUF) &&
7281          "Trying to execute plan with unsupported UF");
7282   assert(
7283       (IsEpilogueVectorization || !ExpandedSCEVs) &&
7284       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7285   (void)IsEpilogueVectorization;
7286 
7287   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7288 
7289   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7290                     << ", UF=" << BestUF << '\n');
7291   BestVPlan.setName("Final VPlan");
7292   LLVM_DEBUG(BestVPlan.dump());
7293 
7294   // Perform the actual loop transformation.
7295   VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7296                          OrigLoop->getHeader()->getContext());
7297 
7298   // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7299   // before making any changes to the CFG.
7300   if (!BestVPlan.getPreheader()->empty()) {
7301     State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7302     State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7303     BestVPlan.getPreheader()->execute(&State);
7304   }
7305   if (!ILV.getTripCount())
7306     ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7307   else
7308     assert(IsEpilogueVectorization && "should only re-use the existing trip "
7309                                       "count during epilogue vectorization");
7310 
7311   // 1. Set up the skeleton for vectorization, including vector pre-header and
7312   // middle block. The vector loop is created during VPlan execution.
7313   Value *CanonicalIVStartValue;
7314   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7315       ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7316                                                      : State.ExpandedSCEVs);
7317 #ifdef EXPENSIVE_CHECKS
7318   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7319 #endif
7320 
7321   // Only use noalias metadata when using memory checks guaranteeing no overlap
7322   // across all iterations.
7323   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7324   std::unique_ptr<LoopVersioning> LVer = nullptr;
7325   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7326       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7327 
7328     //  We currently don't use LoopVersioning for the actual loop cloning but we
7329     //  still use it to add the noalias metadata.
7330     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7331     //        metadata.
7332     LVer = std::make_unique<LoopVersioning>(
7333         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7334         PSE.getSE());
7335     State.LVer = &*LVer;
7336     State.LVer->prepareNoAliasMetadata();
7337   }
7338 
7339   ILV.printDebugTracesAtStart();
7340 
7341   //===------------------------------------------------===//
7342   //
7343   // Notice: any optimization or new instruction that goes
7344   // into the code below should also be implemented in
7345   // the cost model.
7346   //
7347   //===------------------------------------------------===//
7348 
7349   // 2. Copy and widen instructions from the old loop into the new loop.
7350   BestVPlan.prepareToExecute(ILV.getTripCount(),
7351                              ILV.getOrCreateVectorTripCount(nullptr),
7352                              CanonicalIVStartValue, State);
7353 
7354   BestVPlan.execute(&State);
7355 
7356   // 2.5 Collect reduction resume values.
7357   DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7358   auto *ExitVPBB =
7359       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7360   for (VPRecipeBase &R : *ExitVPBB) {
7361     createAndCollectMergePhiForReduction(
7362         dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7363         State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7364   }
7365 
7366   // 2.6. Maintain Loop Hints
7367   // Keep all loop hints from the original loop on the vector loop (we'll
7368   // replace the vectorizer-specific hints below).
7369   MDNode *OrigLoopID = OrigLoop->getLoopID();
7370 
7371   std::optional<MDNode *> VectorizedLoopID =
7372       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7373                                       LLVMLoopVectorizeFollowupVectorized});
7374 
7375   VPBasicBlock *HeaderVPBB =
7376       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7377   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7378   if (VectorizedLoopID)
7379     L->setLoopID(*VectorizedLoopID);
7380   else {
7381     // Keep all loop hints from the original loop on the vector loop (we'll
7382     // replace the vectorizer-specific hints below).
7383     if (MDNode *LID = OrigLoop->getLoopID())
7384       L->setLoopID(LID);
7385 
7386     LoopVectorizeHints Hints(L, true, *ORE);
7387     Hints.setAlreadyVectorized();
7388   }
7389   TargetTransformInfo::UnrollingPreferences UP;
7390   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7391   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7392     AddRuntimeUnrollDisableMetaData(L);
7393 
7394   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7395   //    predication, updating analyses.
7396   ILV.fixVectorizedLoop(State, BestVPlan);
7397 
7398   ILV.printDebugTracesAtEnd();
7399 
7400   // 4. Adjust branch weight of the branch in the middle block.
7401   auto *MiddleTerm =
7402       cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7403   if (MiddleTerm->isConditional() &&
7404       hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7405     // Assume that `Count % VectorTripCount` is equally distributed.
7406     unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7407     assert(TripCount > 0 && "trip count should not be zero");
7408     const uint32_t Weights[] = {1, TripCount - 1};
7409     setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7410   }
7411 
7412   return {State.ExpandedSCEVs, ReductionResumeValues};
7413 }
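// Branch-weight sketch for the middle-block branch above (assumed factors):
// with VF=4 and UF=2 the vectorized step is 8, so the weights become {1, 7};
// under the stated assumption that Count % 8 is uniformly distributed, the
// first successor is taken in about 1 of 8 cases.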
7414 
7415 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7416 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7417   for (const auto &Plan : VPlans)
7418     if (PrintVPlansInDotFormat)
7419       Plan->printDOT(O);
7420     else
7421       Plan->print(O);
7422 }
7423 #endif
7424 
7425 //===--------------------------------------------------------------------===//
7426 // EpilogueVectorizerMainLoop
7427 //===--------------------------------------------------------------------===//
7428 
7429 /// This function is partially responsible for generating the control flow
7430 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7431 std::pair<BasicBlock *, Value *>
7432 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7433     const SCEV2ValueTy &ExpandedSCEVs) {
7434   createVectorLoopSkeleton("");
7435 
7436   // Generate the code to check the minimum iteration count of the vector
7437   // epilogue (see below).
7438   EPI.EpilogueIterationCountCheck =
7439       emitIterationCountCheck(LoopScalarPreHeader, true);
7440   EPI.EpilogueIterationCountCheck->setName("iter.check");
7441 
7442   // Generate the code to check any assumptions that we've made for SCEV
7443   // expressions.
7444   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7445 
7446   // Generate the code that checks at runtime if arrays overlap. We put the
7447   // checks into a separate block to make the more common case of few elements
7448   // faster.
7449   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7450 
7451   // Generate the iteration count check for the main loop, *after* the check
7452   // for the epilogue loop, so that the path-length is shorter for the case
7453   // that goes directly through the vector epilogue. The longer-path length for
7454   // the main loop is compensated for by the gain from vectorizing the larger
7455   // trip count. Note: the branch will get updated later on when we vectorize
7456   // the epilogue.
7457   EPI.MainLoopIterationCountCheck =
7458       emitIterationCountCheck(LoopScalarPreHeader, false);
7459 
7460   // Generate the induction variable.
7461   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7462 
7463   // Skip induction resume value creation here because they will be created in
7464   // the second pass for the scalar loop. The induction resume values for the
7465   // inductions in the epilogue loop are created before executing the plan for
7466   // the epilogue loop.
7467 
7468   return {LoopVectorPreHeader, nullptr};
7469 }
7470 
7471 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7472   LLVM_DEBUG({
7473     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7474            << "Main Loop VF:" << EPI.MainLoopVF
7475            << ", Main Loop UF:" << EPI.MainLoopUF
7476            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7477            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7478   });
7479 }
7480 
7481 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7482   DEBUG_WITH_TYPE(VerboseDebug, {
7483     dbgs() << "intermediate fn:\n"
7484            << *OrigLoop->getHeader()->getParent() << "\n";
7485   });
7486 }
7487 
7488 BasicBlock *
7489 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7490                                                     bool ForEpilogue) {
7491   assert(Bypass && "Expected valid bypass basic block.");
7492   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7493   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7494   Value *Count = getTripCount();
7495   // Reuse existing vector loop preheader for TC checks.
7496   // Note that new preheader block is generated for vector loop.
7497   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7498   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7499 
7500   // Generate code to check if the loop's trip count is less than VF * UF of the
7501   // main vector loop.
7502   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7503                                                     : VF.isVector())
7504                ? ICmpInst::ICMP_ULE
7505                : ICmpInst::ICMP_ULT;
7506 
7507   Value *CheckMinIters = Builder.CreateICmp(
7508       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7509       "min.iters.check");
7510 
7511   if (!ForEpilogue)
7512     TCCheckBlock->setName("vector.main.loop.iter.check");
7513 
7514   // Create new preheader for vector loop.
7515   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7516                                    DT, LI, nullptr, "vector.ph");
7517 
7518   if (ForEpilogue) {
7519     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7520                                  DT->getNode(Bypass)->getIDom()) &&
7521            "TC check is expected to dominate Bypass");
7522 
7523     // Update dominator for Bypass.
7524     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7525     LoopBypassBlocks.push_back(TCCheckBlock);
7526 
7527     // Save the trip count so we don't have to regenerate it in the
7528     // vec.epilog.iter.check. This is safe to do because the trip count
7529     // generated here dominates the vector epilog iter check.
7530     EPI.TripCount = Count;
7531   }
7532 
7533   BranchInst &BI =
7534       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7535   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7536     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7537   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7538 
7539   return TCCheckBlock;
7540 }
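// Sketch of the generated check (assumed fixed VF=8, UF=2, i64 trip count):
//   %min.iters.check = icmp ult i64 %count, 16
// or 'ule' when a scalar epilogue is required; for scalable VFs the constant
// 16 becomes a vscale-based step produced by createStepForVF.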
7541 
7542 //===--------------------------------------------------------------------===//
7543 // EpilogueVectorizerEpilogueLoop
7544 //===--------------------------------------------------------------------===//
7545 
7546 /// This function is partially responsible for generating the control flow
7547 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7548 std::pair<BasicBlock *, Value *>
7549 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7550     const SCEV2ValueTy &ExpandedSCEVs) {
7551   createVectorLoopSkeleton("vec.epilog.");
7552 
7553   // Now, compare the remaining count and if there aren't enough iterations to
7554   // execute the vectorized epilogue, skip to the scalar part.
7555   LoopVectorPreHeader->setName("vec.epilog.ph");
7556   BasicBlock *VecEpilogueIterationCountCheck =
7557       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
7558                  nullptr, "vec.epilog.iter.check", true);
7559   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7560                                           VecEpilogueIterationCountCheck);
7561 
7562   // Adjust the control flow taking the state info from the main loop
7563   // vectorization into account.
7564   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7565          "expected this to be saved from the previous pass.");
7566   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7567       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7568 
7569   DT->changeImmediateDominator(LoopVectorPreHeader,
7570                                EPI.MainLoopIterationCountCheck);
7571 
7572   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7573       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7574 
7575   if (EPI.SCEVSafetyCheck)
7576     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7577         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7578   if (EPI.MemSafetyCheck)
7579     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7580         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7581 
7582   DT->changeImmediateDominator(
7583       VecEpilogueIterationCountCheck,
7584       VecEpilogueIterationCountCheck->getSinglePredecessor());
7585 
7586   DT->changeImmediateDominator(LoopScalarPreHeader,
7587                                EPI.EpilogueIterationCountCheck);
7588   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7589     // If there is an epilogue which must run, there's no edge from the
7590     // middle block to exit blocks and thus no need to update the immediate
7591     // dominator of the exit blocks.
7592     DT->changeImmediateDominator(LoopExitBlock,
7593                                  EPI.EpilogueIterationCountCheck);
7594 
7595   // Keep track of bypass blocks, as they feed start values to the induction and
7596   // reduction phis in the scalar loop preheader.
7597   if (EPI.SCEVSafetyCheck)
7598     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7599   if (EPI.MemSafetyCheck)
7600     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7601   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7602 
7603   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7604   // reductions which merge control-flow from the latch block and the middle
7605   // block. Update the incoming values here and move the Phi into the preheader.
7606   SmallVector<PHINode *, 4> PhisInBlock;
7607   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7608     PhisInBlock.push_back(&Phi);
7609 
7610   for (PHINode *Phi : PhisInBlock) {
7611     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7612     Phi->replaceIncomingBlockWith(
7613         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7614         VecEpilogueIterationCountCheck);
7615 
7616     // If the phi doesn't have an incoming value from the
7617     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7618     // value and also those from other check blocks. This is needed for
7619     // reduction phis only.
7620     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7621           return EPI.EpilogueIterationCountCheck == IncB;
7622         }))
7623       continue;
7624     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7625     if (EPI.SCEVSafetyCheck)
7626       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7627     if (EPI.MemSafetyCheck)
7628       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7629   }
7630 
7631   // Generate a resume induction for the vector epilogue and put it in the
7632   // vector epilogue preheader.
7633   Type *IdxTy = Legal->getWidestInductionType();
7634   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7635   EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7636   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7637   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7638                            EPI.MainLoopIterationCountCheck);
7639 
7640   // Generate induction resume values. These variables save the new starting
7641   // indexes for the scalar loop. They are used to test if there are any tail
7642   // iterations left once the vector loop has completed.
7643   // Note that when the vectorized epilogue is skipped due to iteration count
7644   // check, then the resume value for the induction variable comes from
7645   // the trip count of the main vector loop, hence passing the AdditionalBypass
7646   // argument.
7647   createInductionResumeValues(ExpandedSCEVs,
7648                               {VecEpilogueIterationCountCheck,
7649                                EPI.VectorTripCount} /* AdditionalBypass */);
7650 
7651   return {LoopVectorPreHeader, EPResumeVal};
7652 }
7653 
7654 BasicBlock *
7655 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7656     BasicBlock *Bypass, BasicBlock *Insert) {
7657 
7658   assert(EPI.TripCount &&
7659          "Expected trip count to have been safed in the first pass.");
7660   assert(
7661       (!isa<Instruction>(EPI.TripCount) ||
7662        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7663       "saved trip count does not dominate insertion point.");
7664   Value *TC = EPI.TripCount;
7665   IRBuilder<> Builder(Insert->getTerminator());
7666   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7667 
7668   // Generate code to check if the loop's trip count is less than VF * UF of the
7669   // vector epilogue loop.
7670   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7671                ? ICmpInst::ICMP_ULE
7672                : ICmpInst::ICMP_ULT;
7673 
7674   Value *CheckMinIters =
7675       Builder.CreateICmp(P, Count,
7676                          createStepForVF(Builder, Count->getType(),
7677                                          EPI.EpilogueVF, EPI.EpilogueUF),
7678                          "min.epilog.iters.check");
7679 
7680   BranchInst &BI =
7681       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7682   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7683     unsigned MainLoopStep = UF * VF.getKnownMinValue();
7684     unsigned EpilogueLoopStep =
7685         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7686     // We assume the remaining `Count` is equally distributed in
7687     // [0, MainLoopStep)
7688     // So the probability for `Count < EpilogueLoopStep` should be
7689     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7690     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7691     const uint32_t Weights[] = {EstimatedSkipCount,
7692                                 MainLoopStep - EstimatedSkipCount};
7693     setBranchWeights(BI, Weights, /*IsExpected=*/false);
7694   }
7695   ReplaceInstWithInst(Insert->getTerminator(), &BI);
7696   LoopBypassBlocks.push_back(Insert);
7697   return Insert;
7698 }
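// Weight sketch for the epilogue check above (assumed factors): with a main
// loop step of 16 (VF=8, UF=2) and an epilogue step of 4 (VF=4, UF=1),
// EstimatedSkipCount = min(16, 4) = 4 and the weights become {4, 12}, i.e. the
// vector epilogue is expected to be skipped in about 4 of 16 cases.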
7699 
7700 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7701   LLVM_DEBUG({
7702     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7703            << "Epilogue Loop VF:" << EPI.EpilogueVF
7704            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7705   });
7706 }
7707 
7708 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7709   DEBUG_WITH_TYPE(VerboseDebug, {
7710     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7711   });
7712 }
7713 
7714 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7715     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7716   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7717   bool PredicateAtRangeStart = Predicate(Range.Start);
7718 
7719   for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7720     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7721       Range.End = TmpVF;
7722       break;
7723     }
7724 
7725   return PredicateAtRangeStart;
7726 }
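// Usage sketch (assumed values): with Range = [4, 32) and a predicate that is
// true for VF=4 and VF=8 but false for VF=16, the call returns true and clamps
// Range.End to 16, so the decision holds uniformly for the remaining {4, 8}.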
7727 
7728 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7729 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7730 /// of VF's starting at a given VF and extending it as much as possible. Each
7731 /// vectorization decision can potentially shorten this sub-range during
7732 /// buildVPlan().
7733 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7734                                            ElementCount MaxVF) {
7735   auto MaxVFTimes2 = MaxVF * 2;
7736   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7737     VFRange SubRange = {VF, MaxVFTimes2};
7738     VPlans.push_back(buildVPlan(SubRange));
7739     VF = SubRange.End;
7740   }
7741 }
7742 
7743 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7744 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
7745   std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7746     if (auto *I = dyn_cast<Instruction>(Op)) {
7747       if (auto *R = Ingredient2Recipe.lookup(I))
7748         return R->getVPSingleValue();
7749     }
7750     return Plan.getOrAddLiveIn(Op);
7751   };
7752   return map_range(Operands, Fn);
7753 }
7754 
7755 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
7756   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7757 
7758   // Look for cached value.
7759   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7760   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7761   if (ECEntryIt != EdgeMaskCache.end())
7762     return ECEntryIt->second;
7763 
7764   VPValue *SrcMask = getBlockInMask(Src);
7765 
7766   // The terminator has to be a branch inst!
7767   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7768   assert(BI && "Unexpected terminator found");
7769 
7770   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7771     return EdgeMaskCache[Edge] = SrcMask;
7772 
7773   // If source is an exiting block, we know the exit edge is dynamically dead
7774   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
7775   // adding uses of an otherwise potentially dead instruction.
7776   if (OrigLoop->isLoopExiting(Src))
7777     return EdgeMaskCache[Edge] = SrcMask;
7778 
7779   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
7780   assert(EdgeMask && "No Edge Mask found for condition");
7781 
7782   if (BI->getSuccessor(0) != Dst)
7783     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7784 
7785   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7786     // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
7787     // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
7788     // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7789     EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
7790   }
7791 
7792   return EdgeMaskCache[Edge] = EdgeMask;
7793 }
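// Rough IR-level shape of the mask computed above (illustrative names): if Dst
// is the false successor and the source block itself is masked,
//   %edge.cond = xor i1 %cond, true
//   %edge.mask = select i1 %src.mask, i1 %edge.cond, i1 false
// where the final select is the poison-safe 'logical and' described above.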
7794 
7795 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
7796   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7797 
7798   // Look for cached value.
7799   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7800   EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
7801   assert(ECEntryIt != EdgeMaskCache.end() &&
7802          "looking up mask for edge which has not been created");
7803   return ECEntryIt->second;
7804 }
7805 
7806 void VPRecipeBuilder::createHeaderMask() {
7807   BasicBlock *Header = OrigLoop->getHeader();
7808 
7809   // When not folding the tail, use nullptr to model all-true mask.
7810   if (!CM.foldTailByMasking()) {
7811     BlockMaskCache[Header] = nullptr;
7812     return;
7813   }
7814 
7815   // Introduce the early-exit compare IV <= BTC to form header block mask.
7816   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
7817   // constructing the desired canonical IV in the header block as its first
7818   // non-phi instructions.
7819 
7820   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7821   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
7822   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
7823   HeaderVPBB->insert(IV, NewInsertionPoint);
7824 
7825   VPBuilder::InsertPointGuard Guard(Builder);
7826   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
7827   VPValue *BlockMask = nullptr;
7828   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
7829   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
7830   BlockMaskCache[Header] = BlockMask;
7831 }
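// Mask sketch (assumed trip count 10 and fixed VF=4): BTC = 9, and for the
// vector iteration covering lanes {8, 9, 10, 11} the header mask is
//   icmp ule <8, 9, 10, 11>, <9, 9, 9, 9>  -->  <1, 1, 0, 0>
// so the two out-of-range lanes are masked off when folding the tail.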
7832 
7833 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
7834   // Return the cached value.
7835   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
7836   assert(BCEntryIt != BlockMaskCache.end() &&
7837          "Trying to access mask for block without one.");
7838   return BCEntryIt->second;
7839 }
7840 
7841 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
7842   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7843   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
7844   assert(OrigLoop->getHeader() != BB &&
7845          "Loop header must have cached block mask");
7846 
7847   // All-one mask is modelled as no-mask following the convention for masked
7848   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7849   VPValue *BlockMask = nullptr;
7850   // This is the block mask. We OR all incoming edges.
7851   for (auto *Predecessor : predecessors(BB)) {
7852     VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
7853     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
7854       BlockMaskCache[BB] = EdgeMask;
7855       return;
7856     }
7857 
7858     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7859       BlockMask = EdgeMask;
7860       continue;
7861     }
7862 
7863     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
7864   }
7865 
7866   BlockMaskCache[BB] = BlockMask;
7867 }
7868 
7869 VPWidenMemoryRecipe *
7870 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
7871                                   VFRange &Range) {
7872   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7873          "Must be called with either a load or store");
7874 
7875   auto willWiden = [&](ElementCount VF) -> bool {
7876     LoopVectorizationCostModel::InstWidening Decision =
7877         CM.getWideningDecision(I, VF);
7878     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7879            "CM decision should be taken at this point.");
7880     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7881       return true;
7882     if (CM.isScalarAfterVectorization(I, VF) ||
7883         CM.isProfitableToScalarize(I, VF))
7884       return false;
7885     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7886   };
7887 
7888   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7889     return nullptr;
7890 
7891   VPValue *Mask = nullptr;
7892   if (Legal->isMaskRequired(I))
7893     Mask = getBlockInMask(I->getParent());
7894 
7895   // Determine if the pointer operand of the access is either consecutive or
7896   // reverse consecutive.
7897   LoopVectorizationCostModel::InstWidening Decision =
7898       CM.getWideningDecision(I, Range.Start);
7899   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
7900   bool Consecutive =
7901       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
7902 
7903   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
7904   if (Consecutive) {
7905     auto *GEP = dyn_cast<GetElementPtrInst>(
7906         Ptr->getUnderlyingValue()->stripPointerCasts());
7907     auto *VectorPtr = new VPVectorPointerRecipe(
7908         Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
7909         I->getDebugLoc());
7910     Builder.getInsertBlock()->appendRecipe(VectorPtr);
7911     Ptr = VectorPtr;
7912   }
7913   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7914     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7915                                  I->getDebugLoc());
7916 
7917   StoreInst *Store = cast<StoreInst>(I);
7918   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
7919                                 Reverse, I->getDebugLoc());
7920 }
7921 
7922 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7923 /// insert a recipe to expand the step for the induction recipe.
7924 static VPWidenIntOrFpInductionRecipe *
7925 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
7926                             VPValue *Start, const InductionDescriptor &IndDesc,
7927                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
7928   assert(IndDesc.getStartValue() ==
7929          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
7930   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7931          "step must be loop invariant");
7932 
7933   VPValue *Step =
7934       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
7935   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
7936     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
7937   }
7938   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
7939   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
7940 }
7941 
7942 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
7943     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
7944 
7945   // Check if this is an integer or fp induction. If so, build the recipe that
7946   // produces its scalar and vector values.
7947   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7948     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
7949                                        *PSE.getSE(), *OrigLoop);
7950 
7951   // Check if this is pointer induction. If so, build the recipe for it.
7952   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7953     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
7954                                                            *PSE.getSE());
7955     return new VPWidenPointerInductionRecipe(
7956         Phi, Operands[0], Step, *II,
7957         LoopVectorizationPlanner::getDecisionAndClampRange(
7958             [&](ElementCount VF) {
7959               return CM.isScalarAfterVectorization(Phi, VF);
7960             },
7961             Range));
7962   }
7963   return nullptr;
7964 }
7965 
7966 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
7967     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
7968   // Optimize the special case where the source is a constant integer
7969   // induction variable. Notice that we can only optimize the 'trunc' case
7970   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7971   // (c) other casts depend on pointer size.
7972 
7973   // Determine whether \p K is a truncation based on an induction variable that
7974   // can be optimized.
7975   auto isOptimizableIVTruncate =
7976       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7977     return [=](ElementCount VF) -> bool {
7978       return CM.isOptimizableIVTruncate(K, VF);
7979     };
7980   };
7981 
7982   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7983           isOptimizableIVTruncate(I), Range)) {
7984 
7985     auto *Phi = cast<PHINode>(I->getOperand(0));
7986     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
7987     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
7988     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
7989                                        *OrigLoop);
7990   }
7991   return nullptr;
7992 }
7993 
7994 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
7995                                            ArrayRef<VPValue *> Operands) {
7996   unsigned NumIncoming = Phi->getNumIncomingValues();
7997 
7998   // We know that all PHIs in non-header blocks are converted into selects, so
7999   // we don't have to worry about the insertion order and we can just use the
8000   // builder. At this point we generate the predication tree. There may be
8001   // duplications since this is a simple recursive scan, but future
8002   // optimizations will clean it up.
8003   // TODO: At the moment the first mask is always skipped, but it would be
8004   // better to skip the most expensive mask.
8005   SmallVector<VPValue *, 2> OperandsWithMask;
8006 
8007   for (unsigned In = 0; In < NumIncoming; In++) {
8008     OperandsWithMask.push_back(Operands[In]);
8009     VPValue *EdgeMask =
8010         getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8011     if (!EdgeMask) {
8012       assert(In == 0 && "Both null and non-null edge masks found");
8013       assert(all_equal(Operands) &&
8014              "Distinct incoming values with one having a full mask");
8015       break;
8016     }
8017     if (In == 0)
8018       continue;
8019     OperandsWithMask.push_back(EdgeMask);
8020   }
8021   return new VPBlendRecipe(Phi, OperandsWithMask);
8022 }
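// Operand layout sketch for the recipe built above: for a phi with incoming
// values v0, v1, v2 the blend operands are {v0, v1, m1, v2, m2}, where mK is
// the edge mask of the K-th incoming edge; the first mask is omitted, as noted
// in the TODO above.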
8023 
8024 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8025                                                    ArrayRef<VPValue *> Operands,
8026                                                    VFRange &Range) {
8027   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8028       [this, CI](ElementCount VF) {
8029         return CM.isScalarWithPredication(CI, VF);
8030       },
8031       Range);
8032 
8033   if (IsPredicated)
8034     return nullptr;
8035 
8036   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8037   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8038              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8039              ID == Intrinsic::pseudoprobe ||
8040              ID == Intrinsic::experimental_noalias_scope_decl))
8041     return nullptr;
8042 
8043   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8044   Ops.push_back(Operands.back());
8045 
8046   // Is it beneficial to perform an intrinsic call compared to a lib call?
8047   bool ShouldUseVectorIntrinsic =
8048       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8049                 [&](ElementCount VF) -> bool {
8050                   return CM.getCallWideningDecision(CI, VF).Kind ==
8051                          LoopVectorizationCostModel::CM_IntrinsicCall;
8052                 },
8053                 Range);
8054   if (ShouldUseVectorIntrinsic)
8055     return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
8056                                  CI->getDebugLoc());
8057 
8058   Function *Variant = nullptr;
8059   std::optional<unsigned> MaskPos;
8060   // Is it better to call a vectorized version of the function than to
8061   // scalarize the call?
8062   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8063       [&](ElementCount VF) -> bool {
8064         // The following case may be scalarized depending on the VF.
8065         // The flag shows whether we can use a usual Call for vectorized
8066         // version of the instruction.
8067 
8068         // If we've found a variant at a previous VF, then stop looking. A
8069         // vectorized variant of a function expects input in a certain shape
8070         // -- basically the number of input registers, the number of lanes
8071         // per register, and whether there's a mask required.
8072         // We store a pointer to the variant in the VPWidenCallRecipe, so
8073         // once we have an appropriate variant it's only valid for that VF.
8074         // This will force a different vplan to be generated for each VF that
8075         // finds a valid variant.
8076         if (Variant)
8077           return false;
8078         LoopVectorizationCostModel::CallWideningDecision Decision =
8079             CM.getCallWideningDecision(CI, VF);
8080         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8081           Variant = Decision.Variant;
8082           MaskPos = Decision.MaskPos;
8083           return true;
8084         }
8085 
8086         return false;
8087       },
8088       Range);
8089   if (ShouldUseVectorCall) {
8090     if (MaskPos.has_value()) {
8091       // We have 2 cases that would require a mask:
8092       //   1) The block needs to be predicated, either due to a conditional
8093       //      in the scalar loop or use of an active lane mask with
8094       //      tail-folding, and we use the appropriate mask for the block.
8095       //   2) No mask is required for the block, but the only available
8096       //      vector variant at this VF requires a mask, so we synthesize an
8097       //      all-true mask.
8098       VPValue *Mask = nullptr;
8099       if (Legal->isMaskRequired(CI))
8100         Mask = getBlockInMask(CI->getParent());
8101       else
8102         Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(
8103             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8104 
8105       Ops.insert(Ops.begin() + *MaskPos, Mask);
8106     }
8107 
8108     return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
8109                                  Intrinsic::not_intrinsic, CI->getDebugLoc(),
8110                                  Variant);
8111   }
8112 
8113   return nullptr;
8114 }
8115 
8116 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8117   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8118          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8119   // Instruction should be widened, unless it is scalar after vectorization,
8120   // scalarization is profitable or it is predicated.
8121   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8122     return CM.isScalarAfterVectorization(I, VF) ||
8123            CM.isProfitableToScalarize(I, VF) ||
8124            CM.isScalarWithPredication(I, VF);
8125   };
8126   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8127                                                              Range);
8128 }
8129 
8130 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8131                                            ArrayRef<VPValue *> Operands,
8132                                            VPBasicBlock *VPBB) {
8133   switch (I->getOpcode()) {
8134   default:
8135     return nullptr;
8136   case Instruction::SDiv:
8137   case Instruction::UDiv:
8138   case Instruction::SRem:
8139   case Instruction::URem: {
8140     // If not provably safe, use a select to form a safe divisor before widening the
8141     // div/rem operation itself.  Otherwise fall through to general handling below.
8142     if (CM.isPredicatedInst(I)) {
8143       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8144       VPValue *Mask = getBlockInMask(I->getParent());
8145       VPValue *One =
8146           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8147       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8148       Ops[1] = SafeRHS;
8149       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8150     }
8151     [[fallthrough]];
8152   }
8153   case Instruction::Add:
8154   case Instruction::And:
8155   case Instruction::AShr:
8156   case Instruction::FAdd:
8157   case Instruction::FCmp:
8158   case Instruction::FDiv:
8159   case Instruction::FMul:
8160   case Instruction::FNeg:
8161   case Instruction::FRem:
8162   case Instruction::FSub:
8163   case Instruction::ICmp:
8164   case Instruction::LShr:
8165   case Instruction::Mul:
8166   case Instruction::Or:
8167   case Instruction::Select:
8168   case Instruction::Shl:
8169   case Instruction::Sub:
8170   case Instruction::Xor:
8171   case Instruction::Freeze:
8172     return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8173   };
8174 }
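// Sketch of the predicated-divisor handling above (illustrative IR, after
// widening): a udiv in a predicated block becomes roughly
//   %safe.rhs = select <VF x i1> %block.mask, <VF x i32> %rhs, <VF x i32> splat(1)
//   %wide.div = udiv <VF x i32> %lhs, %safe.rhs
// so masked-off lanes divide by 1 instead of a potentially trapping divisor.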
8175 
8176 void VPRecipeBuilder::fixHeaderPhis() {
8177   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8178   for (VPHeaderPHIRecipe *R : PhisToFix) {
8179     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8180     VPRecipeBase *IncR =
8181         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8182     R->addOperand(IncR->getVPSingleValue());
8183   }
8184 }
8185 
8186 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8187                                                       VFRange &Range) {
8188   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8189       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8190       Range);
8191 
8192   bool IsPredicated = CM.isPredicatedInst(I);
8193 
8194   // Even if the instruction is not marked as uniform, there are certain
8195   // intrinsic calls that can be effectively treated as such, so we check for
8196   // them here. Conservatively, we only do this for scalable vectors, since
8197   // for fixed-width VFs we can always fall back on full scalarization.
8198   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8199     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8200     case Intrinsic::assume:
8201     case Intrinsic::lifetime_start:
8202     case Intrinsic::lifetime_end:
8203       // For scalable vectors if one of the operands is variant then we still
8204       // want to mark as uniform, which will generate one instruction for just
8205       // the first lane of the vector. We can't scalarize the call in the same
8206       // way as for fixed-width vectors because we don't know how many lanes
8207       // there are.
8208       //
8209       // The reasons for doing it this way for scalable vectors are:
8210       //   1. For the assume intrinsic, generating the instruction for the first
8211       //      lane is still better than not generating any at all. For
8212       //      example, the input may be a splat across all lanes.
8213       //   2. For the lifetime start/end intrinsics the pointer operand only
8214       //      does anything useful when the input comes from a stack object,
8215       //      which suggests it should always be uniform. For non-stack objects
8216       //      the effect is to poison the object, which still allows us to
8217       //      remove the call.
8218       IsUniform = true;
8219       break;
8220     default:
8221       break;
8222     }
8223   }
8224   VPValue *BlockInMask = nullptr;
8225   if (!IsPredicated) {
8226     // Finalize the recipe for Instr, first if it is not predicated.
8227     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8228   } else {
8229     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8230     // Instructions marked for predication are replicated and a mask operand is
8231     // added initially. Masked replicate recipes will later be placed under an
8232     // if-then construct to prevent side-effects. Generate recipes to compute
8233     // the block mask for this region.
8234     BlockInMask = getBlockInMask(I->getParent());
8235   }
8236 
8237   // Note that there is some custom logic to mark some intrinsics as uniform
8238   // manually above for scalable vectors, which this assert needs to account for
8239   // as well.
8240   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8241           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8242          "Should not predicate a uniform recipe");
8243   auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8244                                        IsUniform, BlockInMask);
8245   return Recipe;
8246 }
8247 
8248 VPRecipeBase *
8249 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8250                                         ArrayRef<VPValue *> Operands,
8251                                         VFRange &Range, VPBasicBlock *VPBB) {
8252   // First, check for specific widening recipes that deal with inductions, Phi
8253   // nodes, calls and memory operations.
8254   VPRecipeBase *Recipe;
8255   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8256     if (Phi->getParent() != OrigLoop->getHeader())
8257       return tryToBlend(Phi, Operands);
8258 
8259     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8260       return Recipe;
8261 
8262     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8263     assert((Legal->isReductionVariable(Phi) ||
8264             Legal->isFixedOrderRecurrence(Phi)) &&
8265            "can only widen reductions and fixed-order recurrences here");
8266     VPValue *StartV = Operands[0];
8267     if (Legal->isReductionVariable(Phi)) {
8268       const RecurrenceDescriptor &RdxDesc =
8269           Legal->getReductionVars().find(Phi)->second;
8270       assert(RdxDesc.getRecurrenceStartValue() ==
8271              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8272       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8273                                            CM.isInLoopReduction(Phi),
8274                                            CM.useOrderedReductions(RdxDesc));
8275     } else {
8276       // TODO: Currently fixed-order recurrences are modeled as chains of
8277       // first-order recurrences. If there are no users of the intermediate
8278       // recurrences in the chain, the fixed order recurrence should be modeled
8279       // directly, enabling more efficient codegen.
8280       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8281     }
8282 
8283     PhisToFix.push_back(PhiRecipe);
8284     return PhiRecipe;
8285   }
8286 
8287   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8288                                     cast<TruncInst>(Instr), Operands, Range)))
8289     return Recipe;
8290 
8291   // All widen recipes below deal only with VF > 1.
8292   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8293           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8294     return nullptr;
8295 
8296   if (auto *CI = dyn_cast<CallInst>(Instr))
8297     return tryToWidenCall(CI, Operands, Range);
8298 
8299   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8300     return tryToWidenMemory(Instr, Operands, Range);
8301 
8302   if (!shouldWiden(Instr, Range))
8303     return nullptr;
8304 
8305   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8306     return new VPWidenGEPRecipe(GEP,
8307                                 make_range(Operands.begin(), Operands.end()));
8308 
8309   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8310     return new VPWidenSelectRecipe(
8311         *SI, make_range(Operands.begin(), Operands.end()));
8312   }
8313 
8314   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8315     return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8316                                  *CI);
8317   }
8318 
8319   return tryToWiden(Instr, Operands, VPBB);
8320 }
8321 
8322 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8323                                                         ElementCount MaxVF) {
8324   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8325 
8326   auto MaxVFTimes2 = MaxVF * 2;
8327   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8328     VFRange SubRange = {VF, MaxVFTimes2};
8329     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8330       // Now optimize the initial VPlan.
8331       if (!Plan->hasVF(ElementCount::getFixed(1)))
8332         VPlanTransforms::truncateToMinimalBitwidths(
8333             *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8334       VPlanTransforms::optimize(*Plan, *PSE.getSE());
8335       // TODO: try to put it close to addActiveLaneMask().
8336       // Discard the plan if it is not EVL-compatible
8337       if (CM.foldTailWithEVL() &&
8338           !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
8339         break;
8340       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8341       VPlans.push_back(std::move(Plan));
8342     }
8343     VF = SubRange.End;
8344   }
8345 }
8346 
8347 // Add the necessary canonical IV and branch recipes required to control the
8348 // loop.
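// Roughly, the added recipes model the following (a sketch only; names are
// illustrative):
//   vector.body:
//     EMIT %index = CANONICAL-INDUCTION ir<0>, %index.next
//     ...
//   (exiting block):
//     EMIT %index.next = add [nuw] %index, VF * UF
//     EMIT branch-on-count %index.next, vector-trip-count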
8349 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8350                                   DebugLoc DL) {
8351   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8352   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8353 
8354   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8355   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8356   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8357   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8358   Header->insert(CanonicalIVPHI, Header->begin());
8359 
8360   VPBuilder Builder(TopRegion->getExitingBasicBlock());
8361   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8362   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8363       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8364       "index.next");
8365   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8366 
8367   // Add the BranchOnCount VPInstruction to the latch.
8368   Builder.createNaryOp(VPInstruction::BranchOnCount,
8369                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8370 }
8371 
8372 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8373 // original exit block.
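// For illustration (a sketch; value names are made up): given an LCSSA phi in
// the exit block such as
//   %res.lcssa = phi i32 [ %res, %loop.exiting ]
// a VPLiveOut is added tying %res.lcssa to the VPValue modeling %res, unless
// the incoming value is an induction whose exit value is computed outside of
// VPlan (see the checks below).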
8374 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8375                                 VPRecipeBuilder &Builder, VPlan &Plan) {
8376   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8377   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8378   // Only handle single-exit loops with unique exit blocks for now.
8379   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8380     return;
8381 
8382   // Introduce VPUsers modeling the exit values.
8383   for (PHINode &ExitPhi : ExitBB->phis()) {
8384     Value *IncomingValue =
8385         ExitPhi.getIncomingValueForBlock(ExitingBB);
8386     VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8387     // Exit values for inductions are computed and updated outside of VPlan and
8388     // independent of induction recipes.
8389     // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8390     // live-outs.
8391     if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8392          !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8393         isa<VPWidenPointerInductionRecipe>(V))
8394       continue;
8395     Plan.addLiveOut(&ExitPhi, V);
8396   }
8397 }
8398 
8399 /// Feed a resume value for every FOR from the vector loop to the scalar loop,
8400 /// if middle block branches to scalar preheader, by introducing ExtractFromEnd
8401 /// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the
8402 /// latter and corresponds to the scalar header.
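/// For example (a sketch; names are illustrative), for a fixed-order
/// recurrence phi %for, the middle block gets
///   EMIT %vector.recur.extract = extract-from-end %for.backedge, ir<1>
/// and the scalar preheader gets
///   EMIT %scalar.recur.init = resume-phi %vector.recur.extract, %for.start
/// with a VPLiveOut feeding the scalar loop's recurrence phi from the latter.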
8403 static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
8404   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8405 
8406   // Start by finding out if middle block branches to scalar preheader, which is
8407   // not a VPIRBasicBlock, unlike Exit block - the other possible successor of
8408   // middle block.
8409   // TODO: Should be replaced by
8410   // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
8411   // scalar region is modeled as well.
8412   VPBasicBlock *ScalarPHVPBB = nullptr;
8413   auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
8414   for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
8415     if (isa<VPIRBasicBlock>(Succ))
8416       continue;
8417     assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
8418     ScalarPHVPBB = cast<VPBasicBlock>(Succ);
8419   }
8420   if (!ScalarPHVPBB)
8421     return;
8422 
8423   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8424   VPBuilder MiddleBuilder(MiddleVPBB);
8425   // Reset the insert point so new recipes are inserted before the terminator
8426   // and, if present, before the condition feeding it.
8427   if (auto *Terminator = MiddleVPBB->getTerminator()) {
8428     auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
8429     assert((!Condition || Condition->getParent() == MiddleVPBB) &&
8430            "Condition expected in MiddleVPBB");
8431     MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
8432   }
8433   VPValue *OneVPV = Plan.getOrAddLiveIn(
8434       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8435 
8436   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8437     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
8438     if (!FOR)
8439       continue;
8440 
8441     // Extract the resume value and create a new VPLiveOut for it.
8442     auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
8443                                               {FOR->getBackedgeValue(), OneVPV},
8444                                               {}, "vector.recur.extract");
8445     auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
8446         VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
8447         "scalar.recur.init");
8448     Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe);
8449   }
8450 }
8451 
8452 VPlanPtr
8453 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8454 
8455   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8456 
8457   // ---------------------------------------------------------------------------
8458   // Build initial VPlan: Scan the body of the loop in a topological order to
8459   // visit each basic block after having visited its predecessor basic blocks.
8460   // ---------------------------------------------------------------------------
8461 
8462   // Create initial VPlan skeleton, having a basic block for the pre-header
8463   // which contains SCEV expansions that need to happen before the CFG is
8464   // modified; a basic block for the vector pre-header, followed by a region for
8465   // the vector loop, followed by the middle basic block. The skeleton vector
8466   // loop region contains a header and latch basic blocks.
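  // At this point the skeleton looks roughly like (illustrative only):
  //   preheader (SCEV expansions) -> vector.ph
  //     -> [ vector loop region: header ... latch ] -> middle block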
8467 
8468   bool RequiresScalarEpilogueCheck =
8469       LoopVectorizationPlanner::getDecisionAndClampRange(
8470           [this](ElementCount VF) {
8471             return !CM.requiresScalarEpilogue(VF.isVector());
8472           },
8473           Range);
8474   VPlanPtr Plan = VPlan::createInitialVPlan(
8475       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8476       *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
8477       OrigLoop);
8478 
8479   // Don't use getDecisionAndClampRange here, because we don't know the UF,
8480   // so it is better to be conservative rather than to split the range up
8481   // into different VPlans.
8482   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8483   bool IVUpdateMayOverflow = false;
8484   for (ElementCount VF : Range)
8485     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8486 
8487   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8488   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8489   // When not folding the tail, we know that the induction increment will not
8490   // overflow.
8491   bool HasNUW = Style == TailFoldingStyle::None;
8492   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8493 
8494   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8495 
8496   // ---------------------------------------------------------------------------
8497   // Pre-construction: record ingredients whose recipes we'll need to further
8498   // process after constructing the initial VPlan.
8499   // ---------------------------------------------------------------------------
8500 
8501   // For each interleave group which is relevant for this (possibly trimmed)
8502   // Range, add it to the set of groups to be later applied to the VPlan and add
8503   // placeholders for its members' Recipes which we'll be replacing with a
8504   // single VPInterleaveRecipe.
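  // As a sketch (types and factor are illustrative): a factor-2 group of loads
  // of A[2*i] and A[2*i+1] is later emitted as one wide load of 2*VF elements
  // followed by a de-interleave step (shufflevectors for fixed-width VFs, the
  // (de)interleave2 intrinsics for scalable VFs).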
8505   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8506     auto applyIG = [IG, this](ElementCount VF) -> bool {
8507       bool Result = (VF.isVector() && // Query is illegal for VF == 1
8508                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
8509                          LoopVectorizationCostModel::CM_Interleave);
8510       // For scalable vectors, the only interleave factor currently supported
8511       // is 2 since we require the (de)interleave2 intrinsics instead of
8512       // shufflevectors.
8513       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8514              "Unsupported interleave factor for scalable vectors");
8515       return Result;
8516     };
8517     if (!getDecisionAndClampRange(applyIG, Range))
8518       continue;
8519     InterleaveGroups.insert(IG);
8520   };
8521 
8522   // ---------------------------------------------------------------------------
8523   // Construct recipes for the instructions in the loop
8524   // ---------------------------------------------------------------------------
8525 
8526   // Scan the body of the loop in a topological order to visit each basic block
8527   // after having visited its predecessor basic blocks.
8528   LoopBlocksDFS DFS(OrigLoop);
8529   DFS.perform(LI);
8530 
8531   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
8532   VPBasicBlock *VPBB = HeaderVPBB;
8533   BasicBlock *HeaderBB = OrigLoop->getHeader();
8534   bool NeedsMasks =
8535       CM.foldTailByMasking() ||
8536       any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8537         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8538         return Legal->blockNeedsPredication(BB) || NeedsBlends;
8539       });
8540   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8541     // Relevant instructions from basic block BB will be grouped into VPRecipe
8542     // ingredients and fill a new VPBasicBlock.
8543     if (VPBB != HeaderVPBB)
8544       VPBB->setName(BB->getName());
8545     Builder.setInsertPoint(VPBB);
8546 
8547     if (VPBB == HeaderVPBB)
8548       RecipeBuilder.createHeaderMask();
8549     else if (NeedsMasks)
8550       RecipeBuilder.createBlockInMask(BB);
8551 
8552     // Introduce each ingredient into VPlan.
8553     // TODO: Model and preserve debug intrinsics in VPlan.
8554     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8555       Instruction *Instr = &I;
8556       SmallVector<VPValue *, 4> Operands;
8557       auto *Phi = dyn_cast<PHINode>(Instr);
8558       if (Phi && Phi->getParent() == HeaderBB) {
8559         Operands.push_back(Plan->getOrAddLiveIn(
8560             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8561       } else {
8562         auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8563         Operands = {OpRange.begin(), OpRange.end()};
8564       }
8565 
8566       // Invariant stores inside the loop will be deleted, and a single store
8567       // with the final reduction value will be added to the exit block.
8568       StoreInst *SI;
8569       if ((SI = dyn_cast<StoreInst>(&I)) &&
8570           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8571         continue;
8572 
8573       VPRecipeBase *Recipe =
8574           RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8575       if (!Recipe)
8576         Recipe = RecipeBuilder.handleReplication(Instr, Range);
8577 
8578       RecipeBuilder.setRecipe(Instr, Recipe);
8579       if (isa<VPHeaderPHIRecipe>(Recipe)) {
8580         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8581         // the following cases, VPHeaderPHIRecipes may be created after non-phi
8582         // recipes and need to be moved to the phi section of HeaderVPBB:
8583         // * tail-folding (non-phi recipes computing the header mask are
8584         // introduced earlier than regular header phi recipes, and should appear
8585         // after them)
8586         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8587 
8588         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8589                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8590                "unexpected recipe needs moving");
8591         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8592       } else
8593         VPBB->appendRecipe(Recipe);
8594     }
8595 
8596     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8597     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8598   }
8599 
8600   // After here, VPBB should not be used.
8601   VPBB = nullptr;
8602 
8603   if (CM.requiresScalarEpilogue(Range)) {
8604     // No edge from the middle block to the unique exit block has been inserted
8605     // and there is nothing to fix from vector loop; phis should have incoming
8606     // from scalar loop only.
8607   } else
8608     addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
8609 
8610   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8611          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8612          "entry block must be set to a VPRegionBlock having a non-empty entry "
8613          "VPBasicBlock");
8614   RecipeBuilder.fixHeaderPhis();
8615 
8616   addLiveOutsForFirstOrderRecurrences(*Plan);
8617 
8618   // ---------------------------------------------------------------------------
8619   // Transform initial VPlan: Apply previously taken decisions, in order, to
8620   // bring the VPlan to its final state.
8621   // ---------------------------------------------------------------------------
8622 
8623   // Adjust the recipes for any inloop reductions.
8624   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8625 
8626   // Interleave memory: for each Interleave Group we marked earlier as relevant
8627   // for this VPlan, replace the Recipes widening its memory instructions with a
8628   // single VPInterleaveRecipe at its insertion point.
8629   for (const auto *IG : InterleaveGroups) {
8630     auto *Recipe =
8631         cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8632     SmallVector<VPValue *, 4> StoredValues;
8633     for (unsigned i = 0; i < IG->getFactor(); ++i)
8634       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8635         auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8636         StoredValues.push_back(StoreR->getStoredValue());
8637       }
8638 
8639     bool NeedsMaskForGaps =
8640         IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8641     assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) &&
8642            "masked interleaved groups are not allowed.");
8643     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8644                                         Recipe->getMask(), NeedsMaskForGaps);
8645     VPIG->insertBefore(Recipe);
8646     unsigned J = 0;
8647     for (unsigned i = 0; i < IG->getFactor(); ++i)
8648       if (Instruction *Member = IG->getMember(i)) {
8649         VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8650         if (!Member->getType()->isVoidTy()) {
8651           VPValue *OriginalV = MemberR->getVPSingleValue();
8652           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8653           J++;
8654         }
8655         MemberR->eraseFromParent();
8656       }
8657   }
8658 
8659   for (ElementCount VF : Range)
8660     Plan->addVF(VF);
8661   Plan->setName("Initial VPlan");
8662 
8663   // Replace VPValues for known constant strides guaranteed by predicate scalar
8664   // evolution.
8665   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8666     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8667     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8668     // Only handle constant strides for now.
8669     if (!ScevStride)
8670       continue;
8671 
8672     auto *CI = Plan->getOrAddLiveIn(
8673         ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
8674     if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
8675       StrideVPV->replaceAllUsesWith(CI);
8676 
8677     // The versioned value may not be used in the loop directly but through a
8678     // sext/zext. Add new live-ins in those cases.
8679     for (Value *U : StrideV->users()) {
8680       if (!isa<SExtInst, ZExtInst>(U))
8681         continue;
8682       VPValue *StrideVPV = Plan->getLiveIn(U);
8683       if (!StrideVPV)
8684         continue;
8685       unsigned BW = U->getType()->getScalarSizeInBits();
8686       APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
8687                                  : ScevStride->getAPInt().zext(BW);
8688       VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
8689       StrideVPV->replaceAllUsesWith(CI);
8690     }
8691   }
8692 
8693   VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
8694     return Legal->blockNeedsPredication(BB);
8695   });
8696 
8697   // Sink users of fixed-order recurrence past the recipe defining the previous
8698   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8699   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8700     return nullptr;
8701 
8702   if (useActiveLaneMask(Style)) {
8703     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8704     // TailFoldingStyle is visible there.
8705     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8706     bool WithoutRuntimeCheck =
8707         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8708     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8709                                        WithoutRuntimeCheck);
8710   }
8711   return Plan;
8712 }
8713 
8714 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8715   // Outer loop handling: They may require CFG and instruction level
8716   // transformations before even evaluating whether vectorization is profitable.
8717   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8718   // the vectorization pipeline.
8719   assert(!OrigLoop->isInnermost());
8720   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8721 
8722   // Create new empty VPlan
8723   auto Plan = VPlan::createInitialVPlan(
8724       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8725       *PSE.getSE(), true, false, OrigLoop);
8726 
8727   // Build hierarchical CFG
8728   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8729   HCFGBuilder.buildHierarchicalCFG();
8730 
8731   for (ElementCount VF : Range)
8732     Plan->addVF(VF);
8733 
8734   VPlanTransforms::VPInstructionsToVPRecipes(
8735       Plan,
8736       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8737       *PSE.getSE(), *TLI);
8738 
8739   // Remove the existing terminator of the exiting block of the top-most region.
8740   // A BranchOnCount will be added instead when adding the canonical IV recipes.
8741   auto *Term =
8742       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8743   Term->eraseFromParent();
8744 
8745   // Tail folding is not supported for outer loops, so the induction increment
8746   // is guaranteed to not wrap.
8747   bool HasNUW = true;
8748   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8749                         DebugLoc());
8750   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8751   return Plan;
8752 }
8753 
8754 // Adjust the recipes for reductions. For in-loop reductions the chain of
8755 // instructions leading from the loop exit instr to the phi need to be converted
8756 // to reductions, with one operand being vector and the other being the scalar
8757 // reduction chain. For other reductions, a select is introduced between the phi
8758 // and live-out recipes when folding the tail.
8759 //
8760 // A ComputeReductionResult recipe is added to the middle block, also for
8761 // in-loop reductions which compute their result in-loop, because generating
8762 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8763 //
8764 // Adjust AnyOf reductions; replace the reduction phi for the selected value
8765 // with a boolean reduction phi node to check if the condition is true in any
8766 // iteration. The final value is selected by the final ComputeReductionResult.
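// For an in-loop integer add reduction, for example (a sketch; names are
// illustrative), a widened 'add' link in the chain is rewritten as
//   REDUCE %red = %previous.link + reduce.add(%vec.op) [, %block.mask]
// and a ComputeReductionResult in the middle block produces the final scalar
// value.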
8767 void LoopVectorizationPlanner::adjustRecipesForReductions(
8768     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8769   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8770   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8771   // Gather all VPReductionPHIRecipes and sort them so that intermediate
8772   // stores sunk outside of the loop keep the same order as they had in the
8773   // original loop.
8774   SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8775   for (VPRecipeBase &R : Header->phis()) {
8776     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8777       ReductionPHIList.emplace_back(ReductionPhi);
8778   }
8779   bool HasIntermediateStore = false;
8780   stable_sort(ReductionPHIList,
8781               [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8782                                             const VPReductionPHIRecipe *R2) {
8783                 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8784                 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8785                 HasIntermediateStore |= IS1 || IS2;
8786 
8787                 // If neither of the recipes has an intermediate store, keep the
8788                 // order the same.
8789                 if (!IS1 && !IS2)
8790                   return false;
8791 
8792                 // If only one of the recipes has an intermediate store, then
8793                 // move it towards the beginning of the list.
8794                 if (IS1 && !IS2)
8795                   return true;
8796 
8797                 if (!IS1 && IS2)
8798                   return false;
8799 
8800                 // If both recipes have an intermediate store, then the recipe
8801                 // with the later store should be processed earlier. So it
8802                 // should go to the beginning of the list.
8803                 return DT->dominates(IS2, IS1);
8804               });
8805 
8806   if (HasIntermediateStore && ReductionPHIList.size() > 1)
8807     for (VPRecipeBase *R : ReductionPHIList)
8808       R->moveBefore(*Header, Header->getFirstNonPhi());
8809 
8810   for (VPRecipeBase &R : Header->phis()) {
8811     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8812     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8813       continue;
8814 
8815     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8816     RecurKind Kind = RdxDesc.getRecurrenceKind();
8817     assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
8818            "AnyOf reductions are not allowed for in-loop reductions");
8819 
8820     // Collect the chain of "link" recipes for the reduction starting at PhiR.
8821     SetVector<VPSingleDefRecipe *> Worklist;
8822     Worklist.insert(PhiR);
8823     for (unsigned I = 0; I != Worklist.size(); ++I) {
8824       VPSingleDefRecipe *Cur = Worklist[I];
8825       for (VPUser *U : Cur->users()) {
8826         auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8827         if (!UserRecipe) {
8828           assert(isa<VPLiveOut>(U) &&
8829                  "U must either be a VPSingleDef or VPLiveOut");
8830           continue;
8831         }
8832         Worklist.insert(UserRecipe);
8833       }
8834     }
8835 
8836     // Visit operation "Links" along the reduction chain top-down starting from
8837     // the phi until LoopExitValue. We keep track of the previous item
8838     // (PreviousLink) to tell which of the two operands of a Link will remain
8839     // scalar and which will be reduced. For min/max via select(cmp), the Link
8840     // will be the select instruction. Blend recipes of in-loop reduction phis will
8841     // get folded to their non-phi operand, as the reduction recipe handles the
8842     // condition directly.
8843     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8844     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8845       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8846 
8847       // Index of the first operand which holds a non-mask vector operand.
8848       unsigned IndexOfFirstOperand;
8849       // Recognize a call to the llvm.fmuladd intrinsic.
8850       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8851       VPValue *VecOp;
8852       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8853       if (IsFMulAdd) {
8854         assert(
8855             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
8856             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8857         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8858                 isa<VPWidenCallRecipe>(CurrentLink)) &&
8859                CurrentLink->getOperand(2) == PreviousLink &&
8860                "expected a call where the previous link is the added operand");
8861 
8862         // If the instruction is a call to the llvm.fmuladd intrinsic then we
8863         // need to create an fmul recipe (multiplying the first two operands of
8864         // the fmuladd together) to use as the vector operand for the fadd
8865         // reduction.
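        // Sketch (illustrative): acc = call @llvm.fmuladd(a, b, acc) is
        // handled as
        //   %mul = fmul a, b                      ; new FMul VPInstruction
        //   REDUCE acc = acc + reduce.fadd(%mul)  ; %mul used as VecOp below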
8866         VPInstruction *FMulRecipe = new VPInstruction(
8867             Instruction::FMul,
8868             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8869             CurrentLinkI->getFastMathFlags());
8870         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8871         VecOp = FMulRecipe;
8872       } else {
8873         auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
8874         if (PhiR->isInLoop() && Blend) {
8875           assert(Blend->getNumIncomingValues() == 2 &&
8876                  "Blend must have 2 incoming values");
8877           if (Blend->getIncomingValue(0) == PhiR)
8878             Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
8879           else {
8880             assert(Blend->getIncomingValue(1) == PhiR &&
8881                    "PhiR must be an operand of the blend");
8882             Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
8883           }
8884           continue;
8885         }
8886 
8887         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8888           if (isa<VPWidenRecipe>(CurrentLink)) {
8889             assert(isa<CmpInst>(CurrentLinkI) &&
8890                    "need to have the compare of the select");
8891             continue;
8892           }
8893           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8894                  "must be a select recipe");
8895           IndexOfFirstOperand = 1;
8896         } else {
8897           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8898                  "Expected to replace a VPWidenSC");
8899           IndexOfFirstOperand = 0;
8900         }
8901         // Note that for non-commutable operands (cmp-selects), the semantics of
8902         // the cmp-select are captured in the recurrence kind.
8903         unsigned VecOpId =
8904             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
8905                 ? IndexOfFirstOperand + 1
8906                 : IndexOfFirstOperand;
8907         VecOp = CurrentLink->getOperand(VecOpId);
8908         assert(VecOp != PreviousLink &&
8909                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8910                                        (VecOpId - IndexOfFirstOperand)) ==
8911                    PreviousLink &&
8912                "PreviousLink must be the operand other than VecOp");
8913       }
8914 
8915       BasicBlock *BB = CurrentLinkI->getParent();
8916       VPValue *CondOp = nullptr;
8917       if (CM.blockNeedsPredicationForAnyReason(BB))
8918         CondOp = RecipeBuilder.getBlockInMask(BB);
8919 
8920       VPReductionRecipe *RedRecipe =
8921           new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
8922                                 CondOp, CM.useOrderedReductions(RdxDesc));
8923       // Append the recipe to the end of the VPBasicBlock because we need to
8924       // ensure that it comes after all of its inputs, including CondOp.
8925       // Note that this transformation may leave over dead recipes (including
8926       // CurrentLink), which will be cleaned by a later VPlan transform.
8927       LinkVPBB->appendRecipe(RedRecipe);
8928       CurrentLink->replaceAllUsesWith(RedRecipe);
8929       PreviousLink = RedRecipe;
8930     }
8931   }
8932   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8933   Builder.setInsertPoint(&*LatchVPBB->begin());
8934   VPBasicBlock *MiddleVPBB =
8935       cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
8936   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8937   for (VPRecipeBase &R :
8938        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8939     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8940     if (!PhiR)
8941       continue;
8942 
8943     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8944     // Adjust AnyOf reductions; replace the reduction phi for the selected value
8945     // with a boolean reduction phi node to check if the condition is true in
8946     // any iteration. The final value is selected by the final
8947     // ComputeReductionResult.
8948     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
8949             RdxDesc.getRecurrenceKind())) {
8950       auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
8951         return isa<VPWidenSelectRecipe>(U) ||
8952                (isa<VPReplicateRecipe>(U) &&
8953                 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
8954                     Instruction::Select);
8955       }));
8956       VPValue *Cmp = Select->getOperand(0);
8957       // If the compare is checking the reduction PHI node, adjust it to check
8958       // the start value.
8959       if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
8960         for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
8961           if (CmpR->getOperand(I) == PhiR)
8962             CmpR->setOperand(I, PhiR->getStartValue());
8963       }
8964       VPBuilder::InsertPointGuard Guard(Builder);
8965       Builder.setInsertPoint(Select);
8966 
8967       // If the true value of the select is the reduction phi, the new value is
8968       // selected if the negated condition is true in any iteration.
8969       if (Select->getOperand(1) == PhiR)
8970         Cmp = Builder.createNot(Cmp);
8971       VPValue *Or = Builder.createOr(PhiR, Cmp);
8972       Select->getVPSingleValue()->replaceAllUsesWith(Or);
8973 
8974       // Convert the reduction phi to operate on bools.
8975       PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
8976                               OrigLoop->getHeader()->getContext())));
8977     }
8978 
8979     // If tail is folded by masking, introduce selects between the phi
8980     // and the live-out instruction of each reduction, at the beginning of the
8981     // dedicated latch block.
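    // Sketch (illustrative): the exiting value %rdx.next feeding the
    // ComputeReductionResult is replaced by
    //   %rdx.select = select %header.mask, %rdx.next, %rdx.phi
    // so masked-off (tail) lanes keep the phi's previous value.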
8982     auto *OrigExitingVPV = PhiR->getBackedgeValue();
8983     auto *NewExitingVPV = PhiR->getBackedgeValue();
8984     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
8985       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
8986       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
8987              "reduction recipe must be defined before latch");
8988       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
8989       std::optional<FastMathFlags> FMFs =
8990           PhiTy->isFloatingPointTy()
8991               ? std::make_optional(RdxDesc.getFastMathFlags())
8992               : std::nullopt;
8993       NewExitingVPV =
8994           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
8995       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
8996         return isa<VPInstruction>(&U) &&
8997                cast<VPInstruction>(&U)->getOpcode() ==
8998                    VPInstruction::ComputeReductionResult;
8999       });
9000       if (PreferPredicatedReductionSelect ||
9001           TTI.preferPredicatedReductionSelect(
9002               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9003               TargetTransformInfo::ReductionFlags()))
9004         PhiR->setOperand(1, NewExitingVPV);
9005     }
9006 
9007     // If the vector reduction can be performed in a smaller type, we truncate
9008     // then extend the loop exit value to enable InstCombine to evaluate the
9009     // entire expression in the smaller type.
9010     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9011     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9012         !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9013             RdxDesc.getRecurrenceKind())) {
9014       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9015       Type *RdxTy = RdxDesc.getRecurrenceType();
9016       auto *Trunc =
9017           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9018       auto *Extnd =
9019           RdxDesc.isSigned()
9020               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9021               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9022 
9023       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9024       Extnd->insertAfter(Trunc);
9025       if (PhiR->getOperand(1) == NewExitingVPV)
9026         PhiR->setOperand(1, Extnd->getVPSingleValue());
9027       NewExitingVPV = Extnd;
9028     }
9029 
9030     // We want code in the middle block to appear to execute on the location of
9031     // the scalar loop's latch terminator because: (a) it is all compiler
9032     // generated, (b) these instructions are always executed after evaluating
9033     // the latch conditional branch, and (c) other passes may add new
9034     // predecessors which terminate on this line. This is the easiest way to
9035     // ensure we don't accidentally cause an extra step back into the loop while
9036     // debugging.
9037     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9038 
9039     // TODO: At the moment ComputeReductionResult also drives creation of the
9040     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9041     // even for in-loop reductions, until the reduction resume value handling is
9042     // also modeled in VPlan.
9043     auto *FinalReductionResult = new VPInstruction(
9044         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9045     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9046     OrigExitingVPV->replaceUsesWithIf(
9047         FinalReductionResult,
9048         [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9049   }
9050 
9051   VPlanTransforms::clearReductionWrapFlags(*Plan);
9052 }
9053 
9054 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9055   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9056          "Not a pointer induction according to InductionDescriptor!");
9057   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9058          "Unexpected type.");
9059   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
9060          "Recipe should have been replaced");
9061 
9062   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9063   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9064   Type *PhiType = IndDesc.getStep()->getType();
9065 
9066   // Build a pointer phi
9067   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9068   Type *ScStValueType = ScalarStartValue->getType();
9069   PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9070                                            CanonicalIV->getIterator());
9071 
9072   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9073   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9074 
9075   // A pointer induction, performed by using a gep
9076   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9077 
9078   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9079   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9080   Value *NumUnrolledElems =
9081       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9082   Value *InductionGEP = GetElementPtrInst::Create(
9083       State.Builder.getInt8Ty(), NewPointerPhi,
9084       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9085       InductionLoc);
9086   // Add induction update using an incorrect block temporarily. The phi node
9087   // will be fixed after VPlan execution. Note that at this point the latch
9088   // block cannot be used, as it does not exist yet.
9089   // TODO: Model increment value in VPlan, by turning the recipe into a
9090   // multi-def and a subclass of VPHeaderPHIRecipe.
9091   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9092 
9093   // Create UF many actual address geps that use the pointer
9094   // phi as base and a vectorized version of the step value
9095   // (<step*0, ..., step*N>) as offset.
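  // For example (a sketch with fixed VF=4, Part=0): StartOffset is
  //   <0, 1, 2, 3>, so the GEP produces
  //   <ptr.phi + 0*step, ptr.phi + 1*step, ptr.phi + 2*step, ptr.phi + 3*step>.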
9096   for (unsigned Part = 0; Part < State.UF; ++Part) {
9097     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9098     Value *StartOffsetScalar =
9099         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9100     Value *StartOffset =
9101         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9102     // Create a vector of consecutive numbers from zero to VF.
9103     StartOffset = State.Builder.CreateAdd(
9104         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9105 
9106     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9107            "scalar step must be the same across all parts");
9108     Value *GEP = State.Builder.CreateGEP(
9109         State.Builder.getInt8Ty(), NewPointerPhi,
9110         State.Builder.CreateMul(
9111             StartOffset,
9112             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9113             "vector.gep"));
9114     State.set(this, GEP, Part);
9115   }
9116 }
9117 
9118 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9119   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9120 
9121   // Fast-math-flags propagate from the original induction instruction.
9122   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9123   if (FPBinOp)
9124     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9125 
9126   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9127   Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9128   Value *DerivedIV = emitTransformedIndex(
9129       State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9130       Kind, cast_if_present<BinaryOperator>(FPBinOp));
9131   DerivedIV->setName("offset.idx");
9132   assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9133 
9134   State.set(this, DerivedIV, VPIteration(0, 0));
9135 }
9136 
9137 void VPReplicateRecipe::execute(VPTransformState &State) {
9138   Instruction *UI = getUnderlyingInstr();
9139   if (State.Instance) { // Generate a single instance.
9140     assert((State.VF.isScalar() || !isUniform()) &&
9141            "uniform recipe shouldn't be predicated");
9142     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9143     State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9144     // Insert scalar instance packing it into a vector.
9145     if (State.VF.isVector() && shouldPack()) {
9146       // If we're constructing lane 0, initialize to start from poison.
9147       if (State.Instance->Lane.isFirstLane()) {
9148         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9149         Value *Poison = PoisonValue::get(
9150             VectorType::get(UI->getType(), State.VF));
9151         State.set(this, Poison, State.Instance->Part);
9152       }
9153       State.packScalarIntoVectorValue(this, *State.Instance);
9154     }
9155     return;
9156   }
9157 
9158   if (IsUniform) {
9159     // If the recipe is uniform across all parts (instead of just per VF), only
9160     // generate a single instance.
9161     if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9162         all_of(operands(), [](VPValue *Op) {
9163           return Op->isDefinedOutsideVectorRegions();
9164         })) {
9165       State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9166       if (user_begin() != user_end()) {
9167         for (unsigned Part = 1; Part < State.UF; ++Part)
9168           State.set(this, State.get(this, VPIteration(0, 0)),
9169                     VPIteration(Part, 0));
9170       }
9171       return;
9172     }
9173 
9174     // Uniform within VL means we need to generate lane 0 only for each
9175     // unrolled copy.
9176     for (unsigned Part = 0; Part < State.UF; ++Part)
9177       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9178     return;
9179   }
9180 
9181   // A store of a loop varying value to a uniform address only needs the last
9182   // copy of the store.
9183   if (isa<StoreInst>(UI) &&
9184       vputils::isUniformAfterVectorization(getOperand(1))) {
9185     auto Lane = VPLane::getLastLaneForVF(State.VF);
9186     State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9187                                     State);
9188     return;
9189   }
9190 
9191   // Generate scalar instances for all VF lanes of all UF parts.
9192   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9193   const unsigned EndLane = State.VF.getKnownMinValue();
9194   for (unsigned Part = 0; Part < State.UF; ++Part)
9195     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9196       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9197 }
9198 
9199 void VPWidenLoadRecipe::execute(VPTransformState &State) {
9200   auto *LI = cast<LoadInst>(&Ingredient);
9201 
9202   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9203   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9204   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9205   bool CreateGather = !isConsecutive();
9206 
9207   auto &Builder = State.Builder;
9208   State.setDebugLocFrom(getDebugLoc());
9209   for (unsigned Part = 0; Part < State.UF; ++Part) {
9210     Value *NewLI;
9211     Value *Mask = nullptr;
9212     if (auto *VPMask = getMask()) {
9213       // Mask reversal is only needed for non-all-one (null) masks, as reverse
9214       // of a null all-one mask is a null mask.
9215       Mask = State.get(VPMask, Part);
9216       if (isReverse())
9217         Mask = Builder.CreateVectorReverse(Mask, "reverse");
9218     }
9219 
9220     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
9221     if (CreateGather) {
9222       NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
9223                                          "wide.masked.gather");
9224     } else if (Mask) {
9225       NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
9226                                        PoisonValue::get(DataTy),
9227                                        "wide.masked.load");
9228     } else {
9229       NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
9230     }
9231     // Add metadata to the load, but setVectorValue to the reverse shuffle.
9232     State.addMetadata(NewLI, LI);
9233     if (Reverse)
9234       NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9235     State.set(this, NewLI, Part);
9236   }
9237 }
9238 
9239 /// Use an all-true mask for the reverse rather than the actual mask, as it
9240 /// avoids a dependence without affecting the result.
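/// A sketch of the emitted call (element type is illustrative):
///   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
///              <vscale x 4 x i32> %operand, <vscale x 4 x i1> splat(true),
///              i32 %evl)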
9241 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
9242                                      Value *EVL, const Twine &Name) {
9243   VectorType *ValTy = cast<VectorType>(Operand->getType());
9244   Value *AllTrueMask =
9245       Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
9246   return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
9247                                  {Operand, AllTrueMask, EVL}, nullptr, Name);
9248 }
9249 
9250 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
9251   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9252                           "explicit vector length.");
9253   auto *LI = cast<LoadInst>(&Ingredient);
9254 
9255   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9256   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9257   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9258   bool CreateGather = !isConsecutive();
9259 
9260   auto &Builder = State.Builder;
9261   State.setDebugLocFrom(getDebugLoc());
9262   CallInst *NewLI;
9263   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9264   Value *Addr = State.get(getAddr(), 0, !CreateGather);
9265   Value *Mask = nullptr;
9266   if (VPValue *VPMask = getMask()) {
9267     Mask = State.get(VPMask, 0);
9268     if (isReverse())
9269       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9270   } else {
9271     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9272   }
9273 
9274   if (CreateGather) {
9275     NewLI =
9276         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9277                                 nullptr, "wide.masked.gather");
9278   } else {
9279     VectorBuilder VBuilder(Builder);
9280     VBuilder.setEVL(EVL).setMask(Mask);
9281     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
9282         Instruction::Load, DataTy, Addr, "vp.op.load"));
9283   }
9284   NewLI->addParamAttr(
9285       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
9286   State.addMetadata(NewLI, LI);
9287   Instruction *Res = NewLI;
9288   if (isReverse())
9289     Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
9290   State.set(this, Res, 0);
9291 }
9292 
9293 void VPWidenStoreRecipe::execute(VPTransformState &State) {
9294   auto *SI = cast<StoreInst>(&Ingredient);
9295 
9296   VPValue *StoredVPValue = getStoredValue();
9297   bool CreateScatter = !isConsecutive();
9298   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9299 
9300   auto &Builder = State.Builder;
9301   State.setDebugLocFrom(getDebugLoc());
9302 
9303   for (unsigned Part = 0; Part < State.UF; ++Part) {
9304     Instruction *NewSI = nullptr;
9305     Value *Mask = nullptr;
9306     if (auto *VPMask = getMask()) {
9307       // Mask reversal is only needed for non-all-one (null) masks, as reverse
9308       // of a null all-one mask is a null mask.
9309       Mask = State.get(VPMask, Part);
9310       if (isReverse())
9311         Mask = Builder.CreateVectorReverse(Mask, "reverse");
9312     }
9313 
9314     Value *StoredVal = State.get(StoredVPValue, Part);
9315     if (isReverse()) {
9316       // If we store to reverse consecutive memory locations, then we need
9317       // to reverse the order of elements in the stored value.
9318       StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9319       // We don't want to update the value in the map as it might be used in
9320       // another expression. So don't call resetVectorValue(StoredVal).
9321     }
9322     Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
9323     if (CreateScatter)
9324       NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
9325     else if (Mask)
9326       NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
9327     else
9328       NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
9329     State.addMetadata(NewSI, SI);
9330   }
9331 }
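
// Illustrative sketch, assuming a consecutive masked store of <4 x i32> with
// align 4 (operand names are made up): the CreateMaskedStore branch above
// produces roughly
//
//   call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr,
//                                         i32 4, <4 x i1> %mask)
//
// where the i32 4 operand is the alignment. Without a mask a plain aligned
// store is emitted, and the non-consecutive case uses @llvm.masked.scatter
// with a vector of pointers.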
9332 
9333 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
9334   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9335                           "explicit vector length.");
9336   auto *SI = cast<StoreInst>(&Ingredient);
9337 
9338   VPValue *StoredValue = getStoredValue();
9339   bool CreateScatter = !isConsecutive();
9340   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9341 
9342   auto &Builder = State.Builder;
9343   State.setDebugLocFrom(getDebugLoc());
9344 
9345   CallInst *NewSI = nullptr;
9346   Value *StoredVal = State.get(StoredValue, 0);
9347   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9348   if (isReverse())
9349     StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
9350   Value *Mask = nullptr;
9351   if (VPValue *VPMask = getMask()) {
9352     Mask = State.get(VPMask, 0);
9353     if (isReverse())
9354       Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9355   } else {
9356     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9357   }
9358   Value *Addr = State.get(getAddr(), 0, !CreateScatter);
9359   if (CreateScatter) {
9360     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9361                                     Intrinsic::vp_scatter,
9362                                     {StoredVal, Addr, Mask, EVL});
9363   } else {
9364     VectorBuilder VBuilder(Builder);
9365     VBuilder.setEVL(EVL).setMask(Mask);
9366     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
9367         Instruction::Store, Type::getVoidTy(EVL->getContext()),
9368         {StoredVal, Addr}));
9369   }
9370   NewSI->addParamAttr(
9371       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
9372   State.addMetadata(NewSI, SI);
9373 }
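
// Illustrative sketch, mirroring the EVL-based load above (types and names are
// assumptions): a consecutive EVL-based store is emitted roughly as
//
//   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val,
//                                       ptr align 4 %addr,
//                                       <vscale x 4 x i1> %mask, i32 %evl)
//
// with the alignment attached to the pointer argument (parameter index 1);
// the non-consecutive case uses @llvm.vp.scatter instead.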
9374 
9375 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9376 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9377 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9378 // for predication.
9379 static ScalarEpilogueLowering getScalarEpilogueLowering(
9380     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9381     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9382     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9383   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9384   // don't look at hints or options, and don't request a scalar epilogue.
9385   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9386   // LoopAccessInfo (due to code dependency and not being able to reliably get
9387   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9388   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9389   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9390   // back to the old way and vectorize with versioning when forced. See D81345.)
9391   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9392                                                       PGSOQueryType::IRPass) &&
9393                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9394     return CM_ScalarEpilogueNotAllowedOptSize;
9395 
9396   // 2) If set, obey the directives
9397   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9398     switch (PreferPredicateOverEpilogue) {
9399     case PreferPredicateTy::ScalarEpilogue:
9400       return CM_ScalarEpilogueAllowed;
9401     case PreferPredicateTy::PredicateElseScalarEpilogue:
9402       return CM_ScalarEpilogueNotNeededUsePredicate;
9403     case PreferPredicateTy::PredicateOrDontVectorize:
9404       return CM_ScalarEpilogueNotAllowedUsePredicate;
9405     }
9406   }
9407 
9408   // 3) If set, obey the hints
9409   switch (Hints.getPredicate()) {
9410   case LoopVectorizeHints::FK_Enabled:
9411     return CM_ScalarEpilogueNotNeededUsePredicate;
9412   case LoopVectorizeHints::FK_Disabled:
9413     return CM_ScalarEpilogueAllowed;
9414   }
9415 
9416   // 4) if the TTI hook indicates this is profitable, request predication.
9417   TailFoldingInfo TFI(TLI, &LVL, IAI);
9418   if (TTI->preferPredicateOverEpilogue(&TFI))
9419     return CM_ScalarEpilogueNotNeededUsePredicate;
9420 
9421   return CM_ScalarEpilogueAllowed;
9422 }
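
// Worked example, assuming the usual mapping from pragmas to loop hints: a
// loop annotated with
//
//   #pragma clang loop vectorize_predicate(enable)
//
// makes Hints.getPredicate() return FK_Enabled, so step 3 above yields
// CM_ScalarEpilogueNotNeededUsePredicate (tail folding by masking), provided
// the function is not being optimized for size and no
// -prefer-predicate-over-epilogue override was passed on the command line.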
9423 
9424 // Process the loop in the VPlan-native vectorization path. This path builds
9425 // VPlan upfront in the vectorization pipeline, which allows applying
9426 // VPlan-to-VPlan transformations from the very beginning without modifying the
9427 // input LLVM IR.
9428 static bool processLoopInVPlanNativePath(
9429     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9430     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9431     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9432     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9433     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9434     LoopVectorizationRequirements &Requirements) {
9435 
9436   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9437     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9438     return false;
9439   }
9440   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9441   Function *F = L->getHeader()->getParent();
9442   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9443 
9444   ScalarEpilogueLowering SEL =
9445       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9446 
9447   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9448                                 &Hints, IAI);
9449   // Use the planner for outer loop vectorization.
9450   // TODO: CM is not used at this point inside the planner. Turn CM into an
9451   // optional argument if we don't need it in the future.
9452   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9453                                ORE);
9454 
9455   // Get user vectorization factor.
9456   ElementCount UserVF = Hints.getWidth();
9457 
9458   CM.collectElementTypesForWidening();
9459 
9460   // Plan how to best vectorize, return the best VF and its cost.
9461   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9462 
9463   // If we are stress testing VPlan builds, do not attempt to generate vector
9464   // code. Masked vector code generation support will follow soon.
9465   // Also, do not attempt to vectorize if no vector code will be produced.
9466   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9467     return false;
9468 
9469   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9470 
9471   {
9472     bool AddBranchWeights =
9473         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9474     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9475                              F->getDataLayout(), AddBranchWeights);
9476     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9477                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9478     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9479                       << L->getHeader()->getParent()->getName() << "\"\n");
9480     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9481   }
9482 
9483   reportVectorization(ORE, L, VF, 1);
9484 
9485   // Mark the loop as already vectorized to avoid vectorizing again.
9486   Hints.setAlreadyVectorized();
9487   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9488   return true;
9489 }
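
// Note: this path is only reached when the VPlan-native path is enabled (for
// example via -mllvm -enable-vplan-native-path) and the outer loop has a
// computable backedge-taken count; otherwise processLoop() below only handles
// innermost loops.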
9490 
9491 // Emit a remark if there are stores to floats that required a floating point
9492 // extension. If the vectorized loop was generated with double precision there
9493 // will be a performance penalty from the conversion overhead and the change in
9494 // the vector width.
9495 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9496   SmallVector<Instruction *, 4> Worklist;
9497   for (BasicBlock *BB : L->getBlocks()) {
9498     for (Instruction &Inst : *BB) {
9499       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9500         if (S->getValueOperand()->getType()->isFloatTy())
9501           Worklist.push_back(S);
9502       }
9503     }
9504   }
9505 
9506   // Traverse the floating point stores upwards, searching for floating point
9507   // conversions.
9508   SmallPtrSet<const Instruction *, 4> Visited;
9509   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9510   while (!Worklist.empty()) {
9511     auto *I = Worklist.pop_back_val();
9512     if (!L->contains(I))
9513       continue;
9514     if (!Visited.insert(I).second)
9515       continue;
9516 
9517     // Emit a remark if the floating point store required a floating
9518     // point conversion.
9519     // TODO: More work could be done to identify the root cause such as a
9520     // constant or a function return type and point the user to it.
9521     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9522       ORE->emit([&]() {
9523         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9524                                           I->getDebugLoc(), L->getHeader())
9525                << "floating point conversion changes vector width. "
9526                << "Mixed floating point precision requires an up/down "
9527                << "cast that will negatively impact performance.";
9528       });
9529 
9530     for (Use &Op : I->operands())
9531       if (auto *OpI = dyn_cast<Instruction>(Op))
9532         Worklist.push_back(OpI);
9533   }
9534 }
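
// Illustrative trigger, assuming typical C semantics for the double literal:
//
//   float f[1024];
//   for (int i = 0; i < 1024; i++)
//     f[i] = f[i] * 2.0; // fpext float->double, fmul, fptrunc, store float
//
// Walking up from the float store reaches the fpext, so the loop gets the
// VectorMixedPrecision remark above; writing 2.0f instead keeps the whole
// computation in single precision.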
9535 
9536 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9537                                        VectorizationFactor &VF,
9538                                        std::optional<unsigned> VScale, Loop *L,
9539                                        ScalarEvolution &SE,
9540                                        ScalarEpilogueLowering SEL) {
9541   InstructionCost CheckCost = Checks.getCost();
9542   if (!CheckCost.isValid())
9543     return false;
9544 
9545   // When only interleaving, the scalar and vector costs will be equal, which
9546   // in turn would lead to a divide by 0. Fall back to a hard threshold.
9547   if (VF.Width.isScalar()) {
9548     if (CheckCost > VectorizeMemoryCheckThreshold) {
9549       LLVM_DEBUG(
9550           dbgs()
9551           << "LV: Interleaving only is not profitable due to runtime checks\n");
9552       return false;
9553     }
9554     return true;
9555   }
9556 
9557   // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9558   uint64_t ScalarC = *VF.ScalarCost.getValue();
9559   if (ScalarC == 0)
9560     return true;
9561 
9562   // First, compute the minimum iteration count required so that the vector
9563   // loop outperforms the scalar loop.
9564   //  The total cost of the scalar loop is
9565   //   ScalarC * TC
9566   //  where
9567   //  * TC is the actual trip count of the loop.
9568   //  * ScalarC is the cost of a single scalar iteration.
9569   //
9570   //  The total cost of the vector loop is
9571   //    RtC + VecC * (TC / VF) + EpiC
9572   //  where
9573   //  * RtC is the cost of the generated runtime checks
9574   //  * VecC is the cost of a single vector iteration.
9575   //  * TC is the actual trip count of the loop
9576   //  * VF is the vectorization factor
9577   //  * EpiCost is the cost of the generated epilogue, including the cost
9578   //    of the remaining scalar operations.
9579   //
9580   // Vectorization is profitable once the total vector cost is less than the
9581   // total scalar cost:
9582   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
9583   //
9584   // Now we can compute the minimum required trip count TC as
9585   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9586   //
9587   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9588   // the computation uses integer arithmetic and the division is rounded up
9589   // (divideCeil), hence we get an upper estimate of the minimum TC.
9590   unsigned IntVF = VF.Width.getKnownMinValue();
9591   if (VF.Width.isScalable()) {
9592     unsigned AssumedMinimumVscale = 1;
9593     if (VScale)
9594       AssumedMinimumVscale = *VScale;
9595     IntVF *= AssumedMinimumVscale;
9596   }
9597   uint64_t RtC = *CheckCost.getValue();
9598   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9599   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9600 
9601   // Second, compute a minimum iteration count so that the cost of the
9602   // runtime checks is only a fraction of the total scalar loop cost. This
9603   // adds a loop-dependent bound on the overhead incurred if the runtime
9604   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9605   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9606   // cost, compute
9607   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
9608   uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
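
  // Worked example with assumed, purely illustrative costs: ScalarC = 4,
  // VF.Cost = 10, IntVF = 4 and RtC = 28 give
  //   MinTC1 = ceil(28 * 4 / (4 * 4 - 10)) = ceil(112 / 6) = 19
  //   MinTC2 = ceil(28 * 10 / 4)           = 70
  // so MinTC = max(19, 70) = 70, which alignTo below rounds up to 72 when a
  // scalar epilogue is allowed.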
9609 
9610   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9611   // epilogue is allowed, choose the next closest multiple of VF. This should
9612   // partly compensate for ignoring the epilogue cost.
9613   uint64_t MinTC = std::max(MinTC1, MinTC2);
9614   if (SEL == CM_ScalarEpilogueAllowed)
9615     MinTC = alignTo(MinTC, IntVF);
9616   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9617 
9618   LLVM_DEBUG(
9619       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9620              << VF.MinProfitableTripCount << "\n");
9621 
9622   // Skip vectorization if the expected trip count is less than the minimum
9623   // required trip count.
9624   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9625     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9626                                 VF.MinProfitableTripCount)) {
9627       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9628                            "trip count < minimum profitable trip count ("
9629                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
9630                         << ")\n");
9631 
9632       return false;
9633     }
9634   }
9635   return true;
9636 }
9637 
9638 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9639     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9640                                !EnableLoopInterleaving),
9641       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9642                               !EnableLoopVectorization) {}
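
// Usage sketch, assuming the setters declared in LoopVectorize.h and a
// FunctionPassManager FPM: a custom pipeline can restrict the pass to
// explicitly annotated loops with
//
//   LoopVectorizeOptions Opts;
//   Opts.setInterleaveOnlyWhenForced(true).setVectorizeOnlyWhenForced(true);
//   FPM.addPass(LoopVectorizePass(Opts));
//
// which leaves both decisions to !llvm.loop metadata / pragmas rather than to
// the cost model's defaults.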
9643 
9644 bool LoopVectorizePass::processLoop(Loop *L) {
9645   assert((EnableVPlanNativePath || L->isInnermost()) &&
9646          "VPlan-native path is not enabled. Only process inner loops.");
9647 
9648   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9649                     << L->getHeader()->getParent()->getName() << "' from "
9650                     << L->getLocStr() << "\n");
9651 
9652   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9653 
9654   LLVM_DEBUG(
9655       dbgs() << "LV: Loop hints:"
9656              << " force="
9657              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9658                      ? "disabled"
9659                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9660                             ? "enabled"
9661                             : "?"))
9662              << " width=" << Hints.getWidth()
9663              << " interleave=" << Hints.getInterleave() << "\n");
9664 
9665   // Function containing loop
9666   Function *F = L->getHeader()->getParent();
9667 
9668   // Looking at the diagnostic output is the only way to determine if a loop
9669   // was vectorized (other than looking at the IR or machine code), so it
9670   // is important to generate an optimization remark for each loop. Most of
9671   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9672   // generated as OptimizationRemark and OptimizationRemarkMissed are
9673   // less verbose reporting vectorized loops and unvectorized loops that may
9674   // benefit from vectorization, respectively.
9675 
9676   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9677     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9678     return false;
9679   }
9680 
9681   PredicatedScalarEvolution PSE(*SE, *L);
9682 
9683   // Check if it is legal to vectorize the loop.
9684   LoopVectorizationRequirements Requirements;
9685   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9686                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9687   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9688     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9689     Hints.emitRemarkWithHints();
9690     return false;
9691   }
9692 
9693   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9694   // here. They may require CFG and instruction level transformations before
9695   // even evaluating whether vectorization is profitable. Since we cannot modify
9696   // the incoming IR, we need to build VPlan upfront in the vectorization
9697   // pipeline.
9698   if (!L->isInnermost())
9699     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9700                                         ORE, BFI, PSI, Hints, Requirements);
9701 
9702   assert(L->isInnermost() && "Inner loop expected.");
9703 
9704   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9705   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9706 
9707   // If an override option has been passed in for interleaved accesses, use it.
9708   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9709     UseInterleaved = EnableInterleavedMemAccesses;
9710 
9711   // Analyze interleaved memory accesses.
9712   if (UseInterleaved)
9713     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9714 
9715   // Check the function attributes and profiles to find out if this function
9716   // should be optimized for size.
9717   ScalarEpilogueLowering SEL =
9718       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9719 
9720   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9721   // count by optimizing for size, to minimize overheads.
9722   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9723   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9724     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9725                       << "This loop is worth vectorizing only if no scalar "
9726                       << "iteration overheads are incurred.");
9727     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9728       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9729     else {
9730       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9731         LLVM_DEBUG(dbgs() << "\n");
9732         // Predicate tail-folded loops are efficient even when the loop
9733         // iteration count is low. However, setting the epilogue policy to
9734         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9735         // with runtime checks. It's more effective to let
9736         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9737         // for the loop.
9738         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9739           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9740       } else {
9741         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9742                              "small to consider vectorizing.\n");
9743         reportVectorizationFailure(
9744             "The trip count is below the minimal threshold value.",
9745             "loop trip count is too low, avoiding vectorization",
9746             "LowTripCount", ORE, L);
9747         Hints.emitRemarkWithHints();
9748         return false;
9749       }
9750     }
9751   }
9752 
9753   // Check the function attributes to see if implicit floats or vectors are
9754   // allowed.
9755   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9756     reportVectorizationFailure(
9757         "Can't vectorize when the NoImplicitFloat attribute is used",
9758         "loop not vectorized due to NoImplicitFloat attribute",
9759         "NoImplicitFloat", ORE, L);
9760     Hints.emitRemarkWithHints();
9761     return false;
9762   }
9763 
9764   // Check if the target supports potentially unsafe FP vectorization.
9765   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9766   // for the target we're vectorizing for, to make sure none of the
9767   // additional fp-math flags can help.
9768   if (Hints.isPotentiallyUnsafe() &&
9769       TTI->isFPVectorizationPotentiallyUnsafe()) {
9770     reportVectorizationFailure(
9771         "Potentially unsafe FP op prevents vectorization",
9772         "loop not vectorized due to unsafe FP support.",
9773         "UnsafeFP", ORE, L);
9774     Hints.emitRemarkWithHints();
9775     return false;
9776   }
9777 
9778   bool AllowOrderedReductions;
9779   // If the flag is set, use that instead and override the TTI behaviour.
9780   if (ForceOrderedReductions.getNumOccurrences() > 0)
9781     AllowOrderedReductions = ForceOrderedReductions;
9782   else
9783     AllowOrderedReductions = TTI->enableOrderedReductions();
9784   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9785     ORE->emit([&]() {
9786       auto *ExactFPMathInst = Requirements.getExactFPInst();
9787       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9788                                                  ExactFPMathInst->getDebugLoc(),
9789                                                  ExactFPMathInst->getParent())
9790              << "loop not vectorized: cannot prove it is safe to reorder "
9791                 "floating-point operations";
9792     });
9793     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9794                          "reorder floating-point operations\n");
9795     Hints.emitRemarkWithHints();
9796     return false;
9797   }
9798 
9799   // Use the cost model.
9800   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9801                                 F, &Hints, IAI);
9802   // Use the planner for vectorization.
9803   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9804                                ORE);
9805 
9806   // Get user vectorization factor and interleave count.
9807   ElementCount UserVF = Hints.getWidth();
9808   unsigned UserIC = Hints.getInterleave();
9809 
9810   // Plan how to best vectorize, return the best VF and its cost.
9811   std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9812 
9813   VectorizationFactor VF = VectorizationFactor::Disabled();
9814   unsigned IC = 1;
9815 
9816   bool AddBranchWeights =
9817       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9818   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9819                            F->getDataLayout(), AddBranchWeights);
9820   if (MaybeVF) {
9821     VF = *MaybeVF;
9822     // Select the interleave count.
9823     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9824 
9825     unsigned SelectedIC = std::max(IC, UserIC);
9826     //  Optimistically generate runtime checks if they are needed. Drop them if
9827     //  they turn out to not be profitable.
9828     if (VF.Width.isVector() || SelectedIC > 1)
9829       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9830 
9831     // Check if it is profitable to vectorize with runtime checks.
9832     bool ForceVectorization =
9833         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9834     if (!ForceVectorization &&
9835         !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
9836                                     *PSE.getSE(), SEL)) {
9837       ORE->emit([&]() {
9838         return OptimizationRemarkAnalysisAliasing(
9839                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9840                    L->getHeader())
9841                << "loop not vectorized: cannot prove it is safe to reorder "
9842                   "memory operations";
9843       });
9844       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9845       Hints.emitRemarkWithHints();
9846       return false;
9847     }
9848   }
9849 
9850   // Identify the diagnostic messages that should be produced.
9851   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9852   bool VectorizeLoop = true, InterleaveLoop = true;
9853   if (VF.Width.isScalar()) {
9854     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9855     VecDiagMsg = std::make_pair(
9856         "VectorizationNotBeneficial",
9857         "the cost-model indicates that vectorization is not beneficial");
9858     VectorizeLoop = false;
9859   }
9860 
9861   if (!MaybeVF && UserIC > 1) {
9862     // Tell the user interleaving was avoided up-front, despite being explicitly
9863     // requested.
9864     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9865                          "interleaving should be avoided up front\n");
9866     IntDiagMsg = std::make_pair(
9867         "InterleavingAvoided",
9868         "Ignoring UserIC, because interleaving was avoided up front");
9869     InterleaveLoop = false;
9870   } else if (IC == 1 && UserIC <= 1) {
9871     // Tell the user interleaving is not beneficial.
9872     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9873     IntDiagMsg = std::make_pair(
9874         "InterleavingNotBeneficial",
9875         "the cost-model indicates that interleaving is not beneficial");
9876     InterleaveLoop = false;
9877     if (UserIC == 1) {
9878       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9879       IntDiagMsg.second +=
9880           " and is explicitly disabled or interleave count is set to 1";
9881     }
9882   } else if (IC > 1 && UserIC == 1) {
9883     // Tell the user interleaving is beneficial, but it is explicitly disabled.
9884     LLVM_DEBUG(
9885         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9886     IntDiagMsg = std::make_pair(
9887         "InterleavingBeneficialButDisabled",
9888         "the cost-model indicates that interleaving is beneficial "
9889         "but is explicitly disabled or interleave count is set to 1");
9890     InterleaveLoop = false;
9891   }
9892 
9893   // Override IC if user provided an interleave count.
9894   IC = UserIC > 0 ? UserIC : IC;
9895 
9896   // Emit diagnostic messages, if any.
9897   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9898   if (!VectorizeLoop && !InterleaveLoop) {
9899     // Do not vectorize or interleave the loop.
9900     ORE->emit([&]() {
9901       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9902                                       L->getStartLoc(), L->getHeader())
9903              << VecDiagMsg.second;
9904     });
9905     ORE->emit([&]() {
9906       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9907                                       L->getStartLoc(), L->getHeader())
9908              << IntDiagMsg.second;
9909     });
9910     return false;
9911   } else if (!VectorizeLoop && InterleaveLoop) {
9912     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9913     ORE->emit([&]() {
9914       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9915                                         L->getStartLoc(), L->getHeader())
9916              << VecDiagMsg.second;
9917     });
9918   } else if (VectorizeLoop && !InterleaveLoop) {
9919     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9920                       << ") in " << L->getLocStr() << '\n');
9921     ORE->emit([&]() {
9922       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9923                                         L->getStartLoc(), L->getHeader())
9924              << IntDiagMsg.second;
9925     });
9926   } else if (VectorizeLoop && InterleaveLoop) {
9927     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9928                       << ") in " << L->getLocStr() << '\n');
9929     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9930   }
9931 
9932   bool DisableRuntimeUnroll = false;
9933   MDNode *OrigLoopID = L->getLoopID();
9934   {
9935     using namespace ore;
9936     if (!VectorizeLoop) {
9937       assert(IC > 1 && "interleave count should not be 1 or 0");
9938       // If we decided that it is not legal to vectorize the loop, then
9939       // interleave it.
9940       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
9941                                  &CM, BFI, PSI, Checks);
9942 
9943       VPlan &BestPlan =
9944           UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width) : LVP.getBestPlan();
9945       assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) &&
9946              "VPlan cost model and legacy cost model disagreed");
9947       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
9948 
9949       ORE->emit([&]() {
9950         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9951                                   L->getHeader())
9952                << "interleaved loop (interleaved count: "
9953                << NV("InterleaveCount", IC) << ")";
9954       });
9955     } else {
9956       // If we decided that it is *legal* to vectorize the loop, then do it.
9957 
9958       // Consider vectorizing the epilogue too if it's profitable.
9959       VectorizationFactor EpilogueVF =
9960           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
9961       if (EpilogueVF.Width.isVector()) {
9962 
9963         // The first pass vectorizes the main loop and creates a scalar epilogue
9964         // to be vectorized by executing the plan (potentially with a different
9965         // factor) again shortly afterwards.
9966         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
9967         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
9968                                            EPI, &LVL, &CM, BFI, PSI, Checks);
9969 
9970         std::unique_ptr<VPlan> BestMainPlan(
9971             LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
9972         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
9973             EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
9974         ++LoopsVectorized;
9975 
9976         // Second pass vectorizes the epilogue and adjusts the control flow
9977         // edges from the first pass.
9978         EPI.MainLoopVF = EPI.EpilogueVF;
9979         EPI.MainLoopUF = EPI.EpilogueUF;
9980         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9981                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
9982                                                  Checks);
9983 
9984         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
9985         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
9986         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9987         Header->setName("vec.epilog.vector.body");
9988 
9989         // Re-use the trip count and steps expanded for the main loop, as
9990         // skeleton creation needs it as a value that dominates both the scalar
9991         // and vector epilogue loops
9992         // TODO: This is a workaround needed for epilogue vectorization and it
9993         // should be removed once induction resume value creation is done
9994         // directly in VPlan.
9995         EpilogILV.setTripCount(MainILV.getTripCount());
9996         for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
9997           auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
9998           auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
9999               ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10000           ExpandR->replaceAllUsesWith(ExpandedVal);
10001           if (BestEpiPlan.getTripCount() == ExpandR)
10002             BestEpiPlan.resetTripCount(ExpandedVal);
10003           ExpandR->eraseFromParent();
10004         }
10005 
10006         // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10007         // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10008         // before vectorizing the epilogue loop.
10009         for (VPRecipeBase &R : Header->phis()) {
10010           if (isa<VPCanonicalIVPHIRecipe>(&R))
10011             continue;
10012 
10013           Value *ResumeV = nullptr;
10014           // TODO: Move setting of resume values to prepareToExecute.
10015           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10016             const RecurrenceDescriptor &RdxDesc =
10017                 ReductionPhi->getRecurrenceDescriptor();
10018             RecurKind RK = RdxDesc.getRecurrenceKind();
10019             ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10020             if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10021               // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10022               // start value; compare the final value from the main vector loop
10023               // to the start value.
10024               IRBuilder<> Builder(
10025                   cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10026               ResumeV = Builder.CreateICmpNE(ResumeV,
10027                                              RdxDesc.getRecurrenceStartValue());
10028             }
10029           } else {
10030             // Create induction resume values for both widened pointer and
10031             // integer/fp inductions and update the start value of the induction
10032             // recipes to use the resume value.
10033             PHINode *IndPhi = nullptr;
10034             const InductionDescriptor *ID;
10035             if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10036               IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10037               ID = &Ind->getInductionDescriptor();
10038             } else {
10039               auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10040               IndPhi = WidenInd->getPHINode();
10041               ID = &WidenInd->getInductionDescriptor();
10042             }
10043 
10044             ResumeV = MainILV.createInductionResumeValue(
10045                 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10046                 {EPI.MainLoopIterationCountCheck});
10047           }
10048           assert(ResumeV && "Must have a resume value");
10049           VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10050           cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10051         }
10052 
10053         assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10054                "DT not preserved correctly");
10055         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10056                         DT, true, &ExpandedSCEVs);
10057         ++LoopsEpilogueVectorized;
10058 
10059         if (!MainILV.areSafetyChecksAdded())
10060           DisableRuntimeUnroll = true;
10061       } else {
10062         ElementCount Width = VF.Width;
10063         VPlan &BestPlan =
10064             UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan();
10065         if (!UseLegacyCostModel) {
10066           assert(size(BestPlan.vectorFactors()) == 1 &&
10067                  "Plan should have a single VF");
10068           Width = *BestPlan.vectorFactors().begin();
10069           LLVM_DEBUG(dbgs()
10070                      << "VF picked by VPlan cost model: " << Width << "\n");
10071           assert(VF.Width == Width &&
10072                  "VPlan cost model and legacy cost model disagreed");
10073         }
10074         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
10075                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10076                                PSI, Checks);
10077         LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
10078         ++LoopsVectorized;
10079 
10080         // Add metadata to disable runtime unrolling a scalar loop when there
10081         // are no runtime checks about strides and memory. A scalar loop that is
10082         // rarely used is not worth unrolling.
10083         if (!LB.areSafetyChecksAdded())
10084           DisableRuntimeUnroll = true;
10085       }
10086       // Report the vectorization decision.
10087       reportVectorization(ORE, L, VF, IC);
10088     }
10089 
10090     if (ORE->allowExtraAnalysis(LV_NAME))
10091       checkMixedPrecision(L, ORE);
10092   }
10093 
10094   std::optional<MDNode *> RemainderLoopID =
10095       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10096                                       LLVMLoopVectorizeFollowupEpilogue});
10097   if (RemainderLoopID) {
10098     L->setLoopID(*RemainderLoopID);
10099   } else {
10100     if (DisableRuntimeUnroll)
10101       AddRuntimeUnrollDisableMetaData(L);
10102 
10103     // Mark the loop as already vectorized to avoid vectorizing again.
10104     Hints.setAlreadyVectorized();
10105   }
10106 
10107   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10108   return true;
10109 }
10110 
10111 LoopVectorizeResult LoopVectorizePass::runImpl(
10112     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10113     DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10114     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10115     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10116   SE = &SE_;
10117   LI = &LI_;
10118   TTI = &TTI_;
10119   DT = &DT_;
10120   BFI = BFI_;
10121   TLI = TLI_;
10122   AC = &AC_;
10123   LAIs = &LAIs_;
10124   DB = &DB_;
10125   ORE = &ORE_;
10126   PSI = PSI_;
10127 
10128   // Don't attempt if
10129   // 1. the target claims to have no vector registers, and
10130   // 2. interleaving won't help ILP.
10131   //
10132   // The second condition is necessary because, even if the target has no
10133   // vector registers, loop vectorization may still enable scalar
10134   // interleaving.
10135   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10136       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10137     return LoopVectorizeResult(false, false);
10138 
10139   bool Changed = false, CFGChanged = false;
10140 
10141   // The vectorizer requires loops to be in simplified form.
10142   // Since simplification may add new inner loops, it has to run before the
10143   // legality and profitability checks. This means running the loop vectorizer
10144   // will simplify all loops, regardless of whether anything ends up being
10145   // vectorized.
10146   for (const auto &L : *LI)
10147     Changed |= CFGChanged |=
10148         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10149 
10150   // Build up a worklist of inner-loops to vectorize. This is necessary as
10151   // the act of vectorizing or partially unrolling a loop creates new loops
10152   // and can invalidate iterators across the loops.
10153   SmallVector<Loop *, 8> Worklist;
10154 
10155   for (Loop *L : *LI)
10156     collectSupportedLoops(*L, LI, ORE, Worklist);
10157 
10158   LoopsAnalyzed += Worklist.size();
10159 
10160   // Now walk the identified inner loops.
10161   while (!Worklist.empty()) {
10162     Loop *L = Worklist.pop_back_val();
10163 
10164     // For the inner loops we actually process, form LCSSA to simplify the
10165     // transform.
10166     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10167 
10168     Changed |= CFGChanged |= processLoop(L);
10169 
10170     if (Changed) {
10171       LAIs->clear();
10172 
10173 #ifndef NDEBUG
10174       if (VerifySCEV)
10175         SE->verify();
10176 #endif
10177     }
10178   }
10179 
10180   // Process each loop nest in the function.
10181   return LoopVectorizeResult(Changed, CFGChanged);
10182 }
10183 
10184 PreservedAnalyses LoopVectorizePass::run(Function &F,
10185                                          FunctionAnalysisManager &AM) {
10186     auto &LI = AM.getResult<LoopAnalysis>(F);
10187     // There are no loops in the function. Return before computing other expensive
10188     // analyses.
10189     if (LI.empty())
10190       return PreservedAnalyses::all();
10191     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10192     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10193     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10194     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10195     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10196     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10197     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10198 
10199     LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10200     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10201     ProfileSummaryInfo *PSI =
10202         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10203     BlockFrequencyInfo *BFI = nullptr;
10204     if (PSI && PSI->hasProfileSummary())
10205       BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10206     LoopVectorizeResult Result =
10207         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10208     if (!Result.MadeAnyChange)
10209       return PreservedAnalyses::all();
10210     PreservedAnalyses PA;
10211 
10212     if (isAssignmentTrackingEnabled(*F.getParent())) {
10213       for (auto &BB : F)
10214         RemoveRedundantDbgInstrs(&BB);
10215     }
10216 
10217     PA.preserve<LoopAnalysis>();
10218     PA.preserve<DominatorTreeAnalysis>();
10219     PA.preserve<ScalarEvolutionAnalysis>();
10220     PA.preserve<LoopAccessAnalysis>();
10221 
10222     if (Result.MadeCFGChange) {
10223       // Making CFG changes likely means a loop got vectorized. Indicate that
10224       // extra simplification passes should be run.
10225       // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10226       // be run if runtime checks have been added.
10227       AM.getResult<ShouldRunExtraVectorPasses>(F);
10228       PA.preserve<ShouldRunExtraVectorPasses>();
10229     } else {
10230       PA.preserveSet<CFGAnalyses>();
10231     }
10232     return PA;
10233 }
10234 
10235 void LoopVectorizePass::printPipeline(
10236     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10237   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10238       OS, MapClassName2PassName);
10239 
10240   OS << '<';
10241   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10242   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10243   OS << '>';
10244 }
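
// Illustrative output, assuming default pass options: -print-pipeline-passes
// would render this pass as
//
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>
//
// flipping to "interleave-forced-only" / "vectorize-forced-only" when the
// corresponding option is set.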
10245