1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
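//
// As an illustrative sketch (not code from this pass): with a vector width of
// 4, a scalar loop such as
//   for (i = 0; i < n; i += 1) a[i] = b[i] + 1;
// is conceptually rewritten into
//   for (i = 0; i + 3 < n; i += 4) a[i:i+3] = b[i:i+3] + 1;
// with any remaining iterations handled by a scalar epilogue loop.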
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanPatternMatch.h"
63 #include "VPlanTransforms.h"
64 #include "VPlanVerifier.h"
65 #include "llvm/ADT/APInt.h"
66 #include "llvm/ADT/ArrayRef.h"
67 #include "llvm/ADT/DenseMap.h"
68 #include "llvm/ADT/DenseMapInfo.h"
69 #include "llvm/ADT/Hashing.h"
70 #include "llvm/ADT/MapVector.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/ValueTracking.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfo.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/MDBuilder.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/PatternMatch.h"
122 #include "llvm/IR/ProfDataUtils.h"
123 #include "llvm/IR/Type.h"
124 #include "llvm/IR/Use.h"
125 #include "llvm/IR/User.h"
126 #include "llvm/IR/Value.h"
127 #include "llvm/IR/ValueHandle.h"
128 #include "llvm/IR/VectorBuilder.h"
129 #include "llvm/IR/Verifier.h"
130 #include "llvm/Support/Casting.h"
131 #include "llvm/Support/CommandLine.h"
132 #include "llvm/Support/Compiler.h"
133 #include "llvm/Support/Debug.h"
134 #include "llvm/Support/ErrorHandling.h"
135 #include "llvm/Support/InstructionCost.h"
136 #include "llvm/Support/MathExtras.h"
137 #include "llvm/Support/raw_ostream.h"
138 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
139 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
140 #include "llvm/Transforms/Utils/LoopSimplify.h"
141 #include "llvm/Transforms/Utils/LoopUtils.h"
142 #include "llvm/Transforms/Utils/LoopVersioning.h"
143 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
144 #include "llvm/Transforms/Utils/SizeOpts.h"
145 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
146 #include <algorithm>
147 #include <cassert>
148 #include <cmath>
149 #include <cstdint>
150 #include <functional>
151 #include <iterator>
152 #include <limits>
153 #include <map>
154 #include <memory>
155 #include <string>
156 #include <tuple>
157 #include <utility>
158
159 using namespace llvm;
160
161 #define LV_NAME "loop-vectorize"
162 #define DEBUG_TYPE LV_NAME
163
164 #ifndef NDEBUG
165 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
166 #endif
167
168 /// @{
169 /// Metadata attribute names
170 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
171 const char LLVMLoopVectorizeFollowupVectorized[] =
172 "llvm.loop.vectorize.followup_vectorized";
173 const char LLVMLoopVectorizeFollowupEpilogue[] =
174 "llvm.loop.vectorize.followup_epilogue";
175 /// @}
176
177 STATISTIC(LoopsVectorized, "Number of loops vectorized");
178 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
179 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
180
181 static cl::opt<bool> EnableEpilogueVectorization(
182 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
183 cl::desc("Enable vectorization of epilogue loops."));
184
185 static cl::opt<unsigned> EpilogueVectorizationForceVF(
186 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
187 cl::desc("When epilogue vectorization is enabled, and a value greater than "
188 "1 is specified, forces the given VF for all applicable epilogue "
189 "loops."));
190
191 static cl::opt<unsigned> EpilogueVectorizationMinVF(
192 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
193 cl::desc("Only loops with vectorization factor equal to or larger than "
194 "the specified value are considered for epilogue vectorization."));
195
196 /// Loops with a known constant trip count below this number are vectorized only
197 /// if no scalar iteration overheads are incurred.
198 static cl::opt<unsigned> TinyTripCountVectorThreshold(
199 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
200 cl::desc("Loops with a constant trip count that is smaller than this "
201 "value are vectorized only if no scalar iteration overheads "
202 "are incurred."));
203
204 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
205 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
206 cl::desc("The maximum allowed number of runtime memory checks"));
207
208 static cl::opt<bool> UseLegacyCostModel(
209 "vectorize-use-legacy-cost-model", cl::init(true), cl::Hidden,
210 cl::desc("Use the legacy cost model instead of the VPlan-based cost model. "
211 "This option will be removed in the future."));
212
213 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
214 // and that predication is preferred; the enum below lists the options. I.e., the
215 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
216 // and predicate the instructions accordingly. If tail-folding fails, there are
217 // different fallback strategies depending on these values:
218 namespace PreferPredicateTy {
219 enum Option {
220 ScalarEpilogue = 0,
221 PredicateElseScalarEpilogue,
222 PredicateOrDontVectorize
223 };
224 } // namespace PreferPredicateTy
225
226 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
227 "prefer-predicate-over-epilogue",
228 cl::init(PreferPredicateTy::ScalarEpilogue),
229 cl::Hidden,
230 cl::desc("Tail-folding and predication preferences over creating a scalar "
231 "epilogue loop."),
232 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
233 "scalar-epilogue",
234 "Don't tail-predicate loops, create scalar epilogue"),
235 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
236 "predicate-else-scalar-epilogue",
237 "prefer tail-folding, create scalar epilogue if tail "
238 "folding fails."),
239 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
240 "predicate-dont-vectorize",
241 "prefers tail-folding, don't attempt vectorization if "
242 "tail-folding fails.")));
243
244 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
245 "force-tail-folding-style", cl::desc("Force the tail folding style"),
246 cl::init(TailFoldingStyle::None),
247 cl::values(
248 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
249 clEnumValN(
250 TailFoldingStyle::Data, "data",
251 "Create lane mask for data only, using active.lane.mask intrinsic"),
252 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
253 "data-without-lane-mask",
254 "Create lane mask with compare/stepvector"),
255 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
256 "Create lane mask using active.lane.mask intrinsic, and use "
257 "it for both data and control flow"),
258 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
259 "data-and-control-without-rt-check",
260 "Similar to data-and-control, but remove the runtime check"),
261 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
262 "Use predicated EVL instructions for tail folding. If EVL "
263 "is unsupported, fallback to data-without-lane-mask.")));
264
265 static cl::opt<bool> MaximizeBandwidth(
266 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
267 cl::desc("Maximize bandwidth when selecting vectorization factor which "
268 "will be determined by the smallest type in loop."));
269
270 static cl::opt<bool> EnableInterleavedMemAccesses(
271 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
272 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
273
274 /// An interleave-group may need masking if it resides in a block that needs
275 /// predication, or in order to mask away gaps.
276 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
277 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
278 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
279
280 static cl::opt<unsigned> ForceTargetNumScalarRegs(
281 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's number of scalar registers."));
283
284 static cl::opt<unsigned> ForceTargetNumVectorRegs(
285 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
286 cl::desc("A flag that overrides the target's number of vector registers."));
287
288 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
289 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
290 cl::desc("A flag that overrides the target's max interleave factor for "
291 "scalar loops."));
292
293 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
294 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
295 cl::desc("A flag that overrides the target's max interleave factor for "
296 "vectorized loops."));
297
298 cl::opt<unsigned> ForceTargetInstructionCost(
299 "force-target-instruction-cost", cl::init(0), cl::Hidden,
300 cl::desc("A flag that overrides the target's expected cost for "
301 "an instruction to a single constant value. Mostly "
302 "useful for getting consistent testing."));
303
304 static cl::opt<bool> ForceTargetSupportsScalableVectors(
305 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
306 cl::desc(
307 "Pretend that scalable vectors are supported, even if the target does "
308 "not support them. This flag should only be used for testing."));
309
310 static cl::opt<unsigned> SmallLoopCost(
311 "small-loop-cost", cl::init(20), cl::Hidden,
312 cl::desc(
313 "The cost of a loop that is considered 'small' by the interleaver."));
314
315 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
316 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
317 cl::desc("Enable the use of the block frequency analysis to access PGO "
318 "heuristics minimizing code growth in cold regions and being more "
319 "aggressive in hot regions."));
320
321 // Runtime interleave loops for load/store throughput.
322 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
323 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
324 cl::desc(
325 "Enable runtime interleaving until load/store ports are saturated"));
326
327 /// The number of stores in a loop that are allowed to need predication.
328 static cl::opt<unsigned> NumberOfStoresToPredicate(
329 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
330 cl::desc("Max number of stores to be predicated behind an if."));
331
332 static cl::opt<bool> EnableIndVarRegisterHeur(
333 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
334 cl::desc("Count the induction variable only once when interleaving"));
335
336 static cl::opt<bool> EnableCondStoresVectorization(
337 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
338 cl::desc("Enable if predication of stores during vectorization."));
339
340 static cl::opt<unsigned> MaxNestedScalarReductionIC(
341 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
342 cl::desc("The maximum interleave count to use when interleaving a scalar "
343 "reduction in a nested loop."));
344
345 static cl::opt<bool>
346 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
347 cl::Hidden,
348 cl::desc("Prefer in-loop vector reductions, "
349 "overriding the targets preference."));
350
351 static cl::opt<bool> ForceOrderedReductions(
352 "force-ordered-reductions", cl::init(false), cl::Hidden,
353 cl::desc("Enable the vectorisation of loops with in-order (strict) "
354 "FP reductions"));
355
356 static cl::opt<bool> PreferPredicatedReductionSelect(
357 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
358 cl::desc(
359 "Prefer predicating a reduction operation over an after loop select."));
360
361 namespace llvm {
362 cl::opt<bool> EnableVPlanNativePath(
363 "enable-vplan-native-path", cl::Hidden,
364 cl::desc("Enable VPlan-native vectorization path with "
365 "support for outer loop vectorization."));
366 } // namespace llvm
367
368 // This flag enables the stress testing of the VPlan H-CFG construction in the
369 // VPlan-native vectorization path. It must be used in conjunction with
370 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
371 // verification of the H-CFGs built.
372 static cl::opt<bool> VPlanBuildStressTest(
373 "vplan-build-stress-test", cl::init(false), cl::Hidden,
374 cl::desc(
375 "Build VPlan for every supported loop nest in the function and bail "
376 "out right after the build (stress test the VPlan H-CFG construction "
377 "in the VPlan-native vectorization path)."));
378
379 cl::opt<bool> llvm::EnableLoopInterleaving(
380 "interleave-loops", cl::init(true), cl::Hidden,
381 cl::desc("Enable loop interleaving in Loop vectorization passes"));
382 cl::opt<bool> llvm::EnableLoopVectorization(
383 "vectorize-loops", cl::init(true), cl::Hidden,
384 cl::desc("Run the Loop vectorization passes"));
385
386 static cl::opt<bool> PrintVPlansInDotFormat(
387 "vplan-print-in-dot-format", cl::Hidden,
388 cl::desc("Use dot format instead of plain text when dumping VPlans"));
389
390 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
391 "force-widen-divrem-via-safe-divisor", cl::Hidden,
392 cl::desc(
393 "Override cost based safe divisor widening for div/rem instructions"));
394
395 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
396 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
397 cl::Hidden,
398 cl::desc("Try wider VFs if they enable the use of vector variants"));
399
400 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
401 // variables not overflowing do not hold. See `emitSCEVChecks`.
402 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
403 // Likelihood of bypassing the vectorized loop because pointers overlap. See
404 // `emitMemRuntimeChecks`.
405 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
406 // Likelihood of bypassing the vectorized loop because there are zero trips left
407 // after prolog. See `emitIterationCountCheck`.
408 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
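// In all three cases the weights encode that the bypass edge is expected to be
// taken roughly 1 time in 128 (a clarifying note, not derived from profile
// data).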
409
410 /// A helper function that returns true if the given type is irregular. The
411 /// type is irregular if its allocated size doesn't equal the store size of an
412 /// element of the corresponding vector type.
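/// For example (illustrative; the result depends on the target's DataLayout):
/// i1 has a type size of 1 bit but an alloc size of 8 bits, and x86_fp80 has a
/// type size of 80 bits but a larger alloc size, so both are irregular, while
/// i32 and float are regular.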
413 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
414 // Determine if an array of N elements of type Ty is "bitcast compatible"
415 // with a <N x Ty> vector.
416 // This is only true if there is no padding between the array elements.
417 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
418 }
419
420 /// Returns "best known" trip count for the specified loop \p L as defined by
421 /// the following procedure:
422 /// 1) Returns exact trip count if it is known.
423 /// 2) Returns expected trip count according to profile data if any.
424 /// 3) Returns upper bound estimate if it is known.
425 /// 4) Returns std::nullopt if all of the above failed.
426 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
427 Loop *L) {
428 // Check if exact trip count is known.
429 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
430 return ExpectedTC;
431
432 // Check if there is an expected trip count available from profile data.
433 if (LoopVectorizeWithBlockFrequency)
434 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
435 return *EstimatedTC;
436
437 // Check if upper bound estimate is known.
438 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
439 return ExpectedTC;
440
441 return std::nullopt;
442 }
443
444 namespace {
445 // Forward declare GeneratedRTChecks.
446 class GeneratedRTChecks;
447
448 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
449 } // namespace
450
451 namespace llvm {
452
453 AnalysisKey ShouldRunExtraVectorPasses::Key;
454
455 /// InnerLoopVectorizer vectorizes loops which contain only one basic
456 /// block to a specified vectorization factor (VF).
457 /// This class performs the widening of scalars into vectors, or multiple
458 /// scalars. This class also implements the following features:
459 /// * It inserts an epilogue loop for handling loops that don't have iteration
460 /// counts that are known to be a multiple of the vectorization factor.
461 /// * It handles the code generation for reduction variables.
462 /// * Scalarization (implementation using scalars) of un-vectorizable
463 /// instructions.
464 /// InnerLoopVectorizer does not perform any vectorization-legality
465 /// checks, and relies on the caller to check for the different legality
466 /// aspects. The InnerLoopVectorizer relies on the
467 /// LoopVectorizationLegality class to provide information about the induction
468 /// and reduction variables that were found to a given vectorization factor.
469 class InnerLoopVectorizer {
470 public:
471 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
472 LoopInfo *LI, DominatorTree *DT,
473 const TargetLibraryInfo *TLI,
474 const TargetTransformInfo *TTI, AssumptionCache *AC,
475 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
476 ElementCount MinProfitableTripCount,
477 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
478 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
479 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
480 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
481 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
482 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
483 PSI(PSI), RTChecks(RTChecks) {
484 // Query this against the original loop and save it here because the profile
485 // of the original loop header may change as the transformation happens.
486 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
487 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
488
489 if (MinProfitableTripCount.isZero())
490 this->MinProfitableTripCount = VecWidth;
491 else
492 this->MinProfitableTripCount = MinProfitableTripCount;
493 }
494
495 virtual ~InnerLoopVectorizer() = default;
496
497 /// Create a new empty loop that will contain vectorized instructions later
498 /// on, while the old loop will be used as the scalar remainder. Control flow
499 /// is generated around the vectorized (and scalar epilogue) loops consisting
500 /// of various checks and bypasses. Return the pre-header block of the new
501 /// loop and the start value for the canonical induction, if it is != 0. The
502 /// latter is the case when vectorizing the epilogue loop. In the case of
503 /// epilogue vectorization, this function is overridden to handle the more
504 /// complex control flow around the loops. \p ExpandedSCEVs is used to
505 /// look up SCEV expansions for expressions needed during skeleton creation.
506 virtual std::pair<BasicBlock *, Value *>
507 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
508
509 /// Fix the vectorized code, taking care of header phis, live-outs, and more.
510 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
511
512 // Return true if any runtime check is added.
513 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
514
515 /// A helper function to scalarize a single Instruction in the innermost loop.
516 /// Generates a sequence of scalar instances for each lane between \p MinLane
517 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
518 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
519 /// Instr's operands.
520 void scalarizeInstruction(const Instruction *Instr,
521 VPReplicateRecipe *RepRecipe,
522 const VPIteration &Instance,
523 VPTransformState &State);
524
525 /// Fix the non-induction PHIs in \p Plan.
526 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
527
528 /// Create a new phi node for the induction variable \p OrigPhi to resume
529 /// iteration count in the scalar epilogue, from where the vectorized loop
530 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
531 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
532 /// and the resume values can come from an additional bypass block, the \p
533 /// AdditionalBypass pair provides information about the bypass block and the
534 /// end value on the edge from bypass to this loop.
535 PHINode *createInductionResumeValue(
536 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
537 ArrayRef<BasicBlock *> BypassBlocks,
538 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
539
540 /// Returns the original loop trip count.
541 Value *getTripCount() const { return TripCount; }
542
543 /// Used to set the trip count after ILV's construction and after the
544 /// preheader block has been executed. Note that this always holds the trip
545 /// count of the original loop for both main loop and epilogue vectorization.
546 void setTripCount(Value *TC) { TripCount = TC; }
547
548 protected:
549 friend class LoopVectorizationPlanner;
550
551 /// A small list of PHINodes.
552 using PhiVector = SmallVector<PHINode *, 4>;
553
554 /// A type for scalarized values in the new loop. Each value from the
555 /// original loop, when scalarized, is represented by UF x VF scalar values
556 /// in the new unrolled loop, where UF is the unroll factor and VF is the
557 /// vectorization factor.
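/// For example (an illustrative sketch): with UF = 2 and a fixed VF of 4, a
/// scalarized value is stored as 2 parts of 4 scalar Values each, indexed
/// first by part and then by lane.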
558 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
559
560 /// Set up the values of the IVs correctly when exiting the vector loop.
561 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
562 Value *VectorTripCount, Value *EndValue,
563 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
564 VPlan &Plan, VPTransformState &State);
565
566 /// Iteratively sink the scalarized operands of a predicated instruction into
567 /// the block that was created for it.
568 void sinkScalarOperands(Instruction *PredInst);
569
570 /// Returns (and creates if needed) the trip count of the widened loop.
571 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
572
573 /// Emit a bypass check to see if the vector trip count is zero, including if
574 /// it overflows.
575 void emitIterationCountCheck(BasicBlock *Bypass);
576
577 /// Emit a bypass check to see if all of the SCEV assumptions we've
578 /// had to make are correct. Returns the block containing the checks or
579 /// nullptr if no checks have been added.
580 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
581
582 /// Emit bypass checks to check any memory assumptions we may have made.
583 /// Returns the block containing the checks or nullptr if no checks have been
584 /// added.
585 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
586
587 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
588 /// vector loop preheader, middle block and scalar preheader.
589 void createVectorLoopSkeleton(StringRef Prefix);
590
591 /// Create new phi nodes for the induction variables to resume iteration count
592 /// in the scalar epilogue, from where the vectorized loop left off.
593 /// In cases where the loop skeleton is more complicated (e.g., epilogue
594 /// vectorization) and the resume values can come from an additional bypass
595 /// block, the \p AdditionalBypass pair provides information about the bypass
596 /// block and the end value on the edge from bypass to this loop.
597 void createInductionResumeValues(
598 const SCEV2ValueTy &ExpandedSCEVs,
599 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
600
601 /// Complete the loop skeleton by adding debug MDs, creating appropriate
602 /// conditional branches in the middle block, preparing the builder and
603 /// running the verifier. Return the preheader of the completed vector loop.
604 BasicBlock *completeLoopSkeleton();
605
606 /// Allow subclasses to override and print debug traces before/after vplan
607 /// execution, when trace information is requested.
608 virtual void printDebugTracesAtStart() {}
609 virtual void printDebugTracesAtEnd() {}
610
611 /// The original loop.
612 Loop *OrigLoop;
613
614 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
615 /// dynamic knowledge to simplify SCEV expressions and converts them to a
616 /// more usable form.
617 PredicatedScalarEvolution &PSE;
618
619 /// Loop Info.
620 LoopInfo *LI;
621
622 /// Dominator Tree.
623 DominatorTree *DT;
624
625 /// Target Library Info.
626 const TargetLibraryInfo *TLI;
627
628 /// Target Transform Info.
629 const TargetTransformInfo *TTI;
630
631 /// Assumption Cache.
632 AssumptionCache *AC;
633
634 /// Interface to emit optimization remarks.
635 OptimizationRemarkEmitter *ORE;
636
637 /// The vectorization SIMD factor to use. Each vector will have this many
638 /// vector elements.
639 ElementCount VF;
640
641 ElementCount MinProfitableTripCount;
642
643 /// The vectorization unroll factor to use. Each scalar is vectorized to this
644 /// many different vector instructions.
645 unsigned UF;
646
647 /// The builder that we use
648 IRBuilder<> Builder;
649
650 // --- Vectorization state ---
651
652 /// The vector-loop preheader.
653 BasicBlock *LoopVectorPreHeader;
654
655 /// The scalar-loop preheader.
656 BasicBlock *LoopScalarPreHeader;
657
658 /// Middle Block between the vector and the scalar.
659 BasicBlock *LoopMiddleBlock;
660
661 /// The unique ExitBlock of the scalar loop if one exists. Note that
662 /// there can be multiple exiting edges reaching this block.
663 BasicBlock *LoopExitBlock;
664
665 /// The scalar loop body.
666 BasicBlock *LoopScalarBody;
667
668 /// A list of all bypass blocks. The first block is the entry of the loop.
669 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
670
671 /// Store instructions that were predicated.
672 SmallVector<Instruction *, 4> PredicatedInstructions;
673
674 /// Trip count of the original loop.
675 Value *TripCount = nullptr;
676
677 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
678 Value *VectorTripCount = nullptr;
679
680 /// The legality analysis.
681 LoopVectorizationLegality *Legal;
682
683 /// The profitability analysis.
684 LoopVectorizationCostModel *Cost;
685
686 // Record whether runtime checks are added.
687 bool AddedSafetyChecks = false;
688
689 // Holds the end values for each induction variable. We save the end values
690 // so we can later fix-up the external users of the induction variables.
691 DenseMap<PHINode *, Value *> IVEndValues;
692
693 /// BFI and PSI are used to check for profile guided size optimizations.
694 BlockFrequencyInfo *BFI;
695 ProfileSummaryInfo *PSI;
696
697 // Whether this loop should be optimized for size based on profile guided size
698 // optimizations.
699 bool OptForSizeBasedOnProfile;
700
701 /// Structure to hold information about generated runtime checks, responsible
702 /// for cleaning the checks, if vectorization turns out unprofitable.
703 GeneratedRTChecks &RTChecks;
704
705 // Holds the resume values for reductions in the loops, used to set the
706 // correct start value of reduction PHIs when vectorizing the epilogue.
707 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
708 ReductionResumeValues;
709 };
710
711 class InnerLoopUnroller : public InnerLoopVectorizer {
712 public:
713 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
714 LoopInfo *LI, DominatorTree *DT,
715 const TargetLibraryInfo *TLI,
716 const TargetTransformInfo *TTI, AssumptionCache *AC,
717 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
718 LoopVectorizationLegality *LVL,
719 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
720 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
721 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
722 ElementCount::getFixed(1),
723 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
724 BFI, PSI, Check) {}
725 };
726
727 /// Encapsulate information regarding vectorization of a loop and its epilogue.
728 /// This information is meant to be updated and used across two stages of
729 /// epilogue vectorization.
730 struct EpilogueLoopVectorizationInfo {
731 ElementCount MainLoopVF = ElementCount::getFixed(0);
732 unsigned MainLoopUF = 0;
733 ElementCount EpilogueVF = ElementCount::getFixed(0);
734 unsigned EpilogueUF = 0;
735 BasicBlock *MainLoopIterationCountCheck = nullptr;
736 BasicBlock *EpilogueIterationCountCheck = nullptr;
737 BasicBlock *SCEVSafetyCheck = nullptr;
738 BasicBlock *MemSafetyCheck = nullptr;
739 Value *TripCount = nullptr;
740 Value *VectorTripCount = nullptr;
741
742 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
743 ElementCount EVF, unsigned EUF)
744 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
745 assert(EUF == 1 &&
746 "A high UF for the epilogue loop is likely not beneficial.");
747 }
748 };
749
750 /// An extension of the inner loop vectorizer that creates a skeleton for a
751 /// vectorized loop that has its epilogue (residual) also vectorized.
752 /// The idea is to run the vplan on a given loop twice, first to set up the
753 /// skeleton and vectorize the main loop, and second to complete the skeleton
754 /// from the first step and vectorize the epilogue. This is achieved by
755 /// deriving two concrete strategy classes from this base class and invoking
756 /// them in succession from the loop vectorizer planner.
757 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
758 public:
759 InnerLoopAndEpilogueVectorizer(
760 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
761 DominatorTree *DT, const TargetLibraryInfo *TLI,
762 const TargetTransformInfo *TTI, AssumptionCache *AC,
763 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
764 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
765 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
766 GeneratedRTChecks &Checks)
767 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
768 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
769 CM, BFI, PSI, Checks),
770 EPI(EPI) {}
771
772 // Override this function to handle the more complex control flow around the
773 // three loops.
774 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
775 const SCEV2ValueTy &ExpandedSCEVs) final {
776 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
777 }
778
779 /// The interface for creating a vectorized skeleton using one of two
780 /// different strategies, each corresponding to one execution of the vplan
781 /// as described above.
782 virtual std::pair<BasicBlock *, Value *>
783 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
784
785 /// Holds and updates state information required to vectorize the main loop
786 /// and its epilogue in two separate passes. This setup helps us avoid
787 /// regenerating and recomputing runtime safety checks. It also helps us to
788 /// shorten the iteration-count-check path length for the cases where the
789 /// iteration count of the loop is so small that the main vector loop is
790 /// completely skipped.
791 EpilogueLoopVectorizationInfo &EPI;
792 };
793
794 /// A specialized derived class of inner loop vectorizer that performs
795 /// vectorization of *main* loops in the process of vectorizing loops and their
796 /// epilogues.
797 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
798 public:
799 EpilogueVectorizerMainLoop(
800 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
801 DominatorTree *DT, const TargetLibraryInfo *TLI,
802 const TargetTransformInfo *TTI, AssumptionCache *AC,
803 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
804 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
805 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
806 GeneratedRTChecks &Check)
807 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
808 EPI, LVL, CM, BFI, PSI, Check) {}
809 /// Implements the interface for creating a vectorized skeleton using the
810 /// *main loop* strategy (i.e., the first pass of vplan execution).
811 std::pair<BasicBlock *, Value *>
812 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
813
814 protected:
815 /// Emits an iteration count bypass check once for the main loop (when \p
816 /// ForEpilogue is false) and once for the epilogue loop (when \p
817 /// ForEpilogue is true).
818 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
819 void printDebugTracesAtStart() override;
820 void printDebugTracesAtEnd() override;
821 };
822
823 // A specialized derived class of inner loop vectorizer that performs
824 // vectorization of *epilogue* loops in the process of vectorizing loops and
825 // their epilogues.
826 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
827 public:
828 EpilogueVectorizerEpilogueLoop(
829 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
830 DominatorTree *DT, const TargetLibraryInfo *TLI,
831 const TargetTransformInfo *TTI, AssumptionCache *AC,
832 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
833 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
834 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
835 GeneratedRTChecks &Checks)
836 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
837 EPI, LVL, CM, BFI, PSI, Checks) {
838 TripCount = EPI.TripCount;
839 }
840 /// Implements the interface for creating a vectorized skeleton using the
841 /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
842 std::pair<BasicBlock *, Value *>
843 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
844
845 protected:
846 /// Emits an iteration count bypass check after the main vector loop has
847 /// finished to see if there are any iterations left to execute by either
848 /// the vector epilogue or the scalar epilogue.
849 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
850 BasicBlock *Bypass,
851 BasicBlock *Insert);
852 void printDebugTracesAtStart() override;
853 void printDebugTracesAtEnd() override;
854 };
855 } // end namespace llvm
856
857 /// Look for a meaningful debug location on the instruction or its
858 /// operands.
859 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
860 if (!I)
861 return DebugLoc();
862
863 DebugLoc Empty;
864 if (I->getDebugLoc() != Empty)
865 return I->getDebugLoc();
866
867 for (Use &Op : I->operands()) {
868 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
869 if (OpInst->getDebugLoc() != Empty)
870 return OpInst->getDebugLoc();
871 }
872
873 return I->getDebugLoc();
874 }
875
876 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
877 /// is passed, the message relates to that particular instruction.
878 #ifndef NDEBUG
879 static void debugVectorizationMessage(const StringRef Prefix,
880 const StringRef DebugMsg,
881 Instruction *I) {
882 dbgs() << "LV: " << Prefix << DebugMsg;
883 if (I != nullptr)
884 dbgs() << " " << *I;
885 else
886 dbgs() << '.';
887 dbgs() << '\n';
888 }
889 #endif
890
891 /// Create an analysis remark that explains why vectorization failed
892 ///
893 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
894 /// RemarkName is the identifier for the remark. If \p I is passed it is an
895 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
896 /// the location of the remark. \return the remark object that can be
897 /// streamed to.
898 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
899 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
900 Value *CodeRegion = TheLoop->getHeader();
901 DebugLoc DL = TheLoop->getStartLoc();
902
903 if (I) {
904 CodeRegion = I->getParent();
905 // If there is no debug location attached to the instruction, fall back to
906 // using the loop's.
907 if (I->getDebugLoc())
908 DL = I->getDebugLoc();
909 }
910
911 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
912 }
913
914 namespace llvm {
915
916 /// Return a value for Step multiplied by VF.
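/// For example (an illustrative sketch): with Step = 2 and a fixed VF of 4
/// this returns the constant 8, while for a scalable VF of <vscale x 4> it
/// returns the runtime value 8 * vscale.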
917 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
918 int64_t Step) {
919 assert(Ty->isIntegerTy() && "Expected an integer step");
920 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
921 }
922
923 /// Return the runtime value for VF.
924 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
925 return B.CreateElementCount(Ty, VF);
926 }
927
928 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
929 Loop *OrigLoop) {
930 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
931 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
932
933 ScalarEvolution &SE = *PSE.getSE();
934 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
935 }
936
937 void reportVectorizationFailure(const StringRef DebugMsg,
938 const StringRef OREMsg, const StringRef ORETag,
939 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
940 Instruction *I) {
941 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
942 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
943 ORE->emit(
944 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
945 << "loop not vectorized: " << OREMsg);
946 }
947
948 /// Reports an informative message: print \p Msg for debugging purposes as well
949 /// as an optimization remark. Uses either \p I as location of the remark, or
950 /// otherwise \p TheLoop.
951 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
952 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
953 Instruction *I = nullptr) {
954 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
955 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
956 ORE->emit(
957 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
958 << Msg);
959 }
960
961 /// Report successful vectorization of the loop. In case an outer loop is
962 /// vectorized, prepend "outer" to the vectorization remark.
963 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
964 VectorizationFactor VF, unsigned IC) {
965 LLVM_DEBUG(debugVectorizationMessage(
966 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
967 nullptr));
968 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
969 ORE->emit([&]() {
970 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
971 TheLoop->getHeader())
972 << "vectorized " << LoopType << "loop (vectorization width: "
973 << ore::NV("VectorizationFactor", VF.Width)
974 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
975 });
976 }
977
978 } // end namespace llvm
979
980 namespace llvm {
981
982 // Loop vectorization cost-model hints how the scalar epilogue loop should be
983 // lowered.
984 enum ScalarEpilogueLowering {
985
986 // The default: allowing scalar epilogues.
987 CM_ScalarEpilogueAllowed,
988
989 // Vectorization with OptForSize: don't allow epilogues.
990 CM_ScalarEpilogueNotAllowedOptSize,
991
992 // A special case of vectorization with OptForSize: loops with a very small
993 // trip count are considered for vectorization under OptForSize, thereby
994 // making sure the cost of their loop body is dominant, free of runtime
995 // guards and scalar iteration overheads.
996 CM_ScalarEpilogueNotAllowedLowTripLoop,
997
998 // Loop hint predicate indicating an epilogue is undesired.
999 CM_ScalarEpilogueNotNeededUsePredicate,
1000
1001 // Directive indicating we must either tail fold or not vectorize
1002 CM_ScalarEpilogueNotAllowedUsePredicate
1003 };
1004
1005 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1006
1007 /// LoopVectorizationCostModel - estimates the expected speedups due to
1008 /// vectorization.
1009 /// In many cases vectorization is not profitable. This can happen for a
1010 /// number of reasons. In this class we mainly attempt to predict the
1011 /// expected speedup/slowdowns due to the supported instruction set. We use the
1012 /// TargetTransformInfo to query the different backends for the cost of
1013 /// different operations.
1014 class LoopVectorizationCostModel {
1015 public:
1016 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1017 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1018 LoopVectorizationLegality *Legal,
1019 const TargetTransformInfo &TTI,
1020 const TargetLibraryInfo *TLI, DemandedBits *DB,
1021 AssumptionCache *AC,
1022 OptimizationRemarkEmitter *ORE, const Function *F,
1023 const LoopVectorizeHints *Hints,
1024 InterleavedAccessInfo &IAI)
1025 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1026 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1027 Hints(Hints), InterleaveInfo(IAI) {}
1028
1029 /// \return An upper bound for the vectorization factors (both fixed and
1030 /// scalable). If the factors are 0, vectorization and interleaving should be
1031 /// avoided up front.
1032 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1033
1034 /// \return True if runtime checks are required for vectorization, and false
1035 /// otherwise.
1036 bool runtimeChecksRequired();
1037
1038 /// Setup cost-based decisions for user vectorization factor.
1039 /// \return true if the UserVF is a feasible VF to be chosen.
1040 bool selectUserVectorizationFactor(ElementCount UserVF) {
1041 collectUniformsAndScalars(UserVF);
1042 collectInstsToScalarize(UserVF);
1043 return expectedCost(UserVF).isValid();
1044 }
1045
1046 /// \return The size (in bits) of the smallest and widest types in the code
1047 /// that needs to be vectorized. We ignore values that remain scalar such as
1048 /// 64 bit loop indices.
1049 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1050
1051 /// \return The desired interleave count.
1052 /// If interleave count has been specified by metadata it will be returned.
1053 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1054 /// are the selected vectorization factor and the cost of the selected VF.
1055 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1056
1057 /// Memory access instruction may be vectorized in more than one way.
1058 /// Form of instruction after vectorization depends on cost.
1059 /// This function takes cost-based decisions for Load/Store instructions
1060 /// and collects them in a map. This decision map is used for building
1061 /// the lists of loop-uniform and loop-scalar instructions.
1062 /// The calculated cost is saved with widening decision in order to
1063 /// avoid redundant calculations.
1064 void setCostBasedWideningDecision(ElementCount VF);
1065
1066 /// A call may be vectorized in different ways depending on whether we have
1067 /// vectorized variants available and whether the target supports masking.
1068 /// This function analyzes all calls in the function at the supplied VF,
1069 /// makes a decision based on the costs of available options, and stores that
1070 /// decision in a map for use in planning and plan execution.
1071 void setVectorizedCallDecision(ElementCount VF);
1072
1073 /// A struct that represents some properties of the register usage
1074 /// of a loop.
1075 struct RegisterUsage {
1076 /// Holds the number of loop invariant values that are used in the loop.
1077 /// The key is ClassID of target-provided register class.
1078 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1079 /// Holds the maximum number of concurrent live intervals in the loop.
1080 /// The key is ClassID of target-provided register class.
1081 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1082 };
1083
1084 /// \return Returns information about the register usages of the loop for the
1085 /// given vectorization factors.
1086 SmallVector<RegisterUsage, 8>
1087 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1088
1089 /// Collect values we want to ignore in the cost model.
1090 void collectValuesToIgnore();
1091
1092 /// Collect all element types in the loop for which widening is needed.
1093 void collectElementTypesForWidening();
1094
1095 /// Split reductions into those that happen in the loop, and those that happen
1096 /// outside. In-loop reductions are collected into InLoopReductions.
1097 void collectInLoopReductions();
1098
1099 /// Returns true if we should use strict in-order reductions for the given
1100 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1101 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1102 /// of FP operations.
1103 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1104 return !Hints->allowReordering() && RdxDesc.isOrdered();
1105 }
1106
1107 /// \returns The smallest bitwidth each instruction can be represented with.
1108 /// The vector equivalents of these instructions should be truncated to this
1109 /// type.
1110 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1111 return MinBWs;
1112 }
1113
1114 /// \returns True if it is more profitable to scalarize instruction \p I for
1115 /// vectorization factor \p VF.
1116 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1117 assert(VF.isVector() &&
1118 "Profitable to scalarize relevant only for VF > 1.");
1119 assert(
1120 TheLoop->isInnermost() &&
1121 "cost-model should not be used for outer loops (in VPlan-native path)");
1122
1123 auto Scalars = InstsToScalarize.find(VF);
1124 assert(Scalars != InstsToScalarize.end() &&
1125 "VF not yet analyzed for scalarization profitability");
1126 return Scalars->second.contains(I);
1127 }
1128
1129 /// Returns true if \p I is known to be uniform after vectorization.
1130 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1131 assert(
1132 TheLoop->isInnermost() &&
1133 "cost-model should not be used for outer loops (in VPlan-native path)");
1134 // Pseudo probe needs to be duplicated for each unrolled iteration and
1135 // vector lane so that profiled loop trip count can be accurately
1136 // accumulated instead of being undercounted.
1137 if (isa<PseudoProbeInst>(I))
1138 return false;
1139
1140 if (VF.isScalar())
1141 return true;
1142
1143 auto UniformsPerVF = Uniforms.find(VF);
1144 assert(UniformsPerVF != Uniforms.end() &&
1145 "VF not yet analyzed for uniformity");
1146 return UniformsPerVF->second.count(I);
1147 }
1148
1149 /// Returns true if \p I is known to be scalar after vectorization.
1150 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1151 assert(
1152 TheLoop->isInnermost() &&
1153 "cost-model should not be used for outer loops (in VPlan-native path)");
1154 if (VF.isScalar())
1155 return true;
1156
1157 auto ScalarsPerVF = Scalars.find(VF);
1158 assert(ScalarsPerVF != Scalars.end() &&
1159 "Scalar values are not calculated for VF");
1160 return ScalarsPerVF->second.count(I);
1161 }
1162
1163 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1164 /// for vectorization factor \p VF.
1165 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1166 return VF.isVector() && MinBWs.contains(I) &&
1167 !isProfitableToScalarize(I, VF) &&
1168 !isScalarAfterVectorization(I, VF);
1169 }
1170
1171 /// Decision that was taken during cost calculation for memory instruction.
1172 enum InstWidening {
1173 CM_Unknown,
1174 CM_Widen, // For consecutive accesses with stride +1.
1175 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1176 CM_Interleave,
1177 CM_GatherScatter,
1178 CM_Scalarize,
1179 CM_VectorCall,
1180 CM_IntrinsicCall
1181 };
1182
1183 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1184 /// instruction \p I and vector width \p VF.
1185 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1186 InstructionCost Cost) {
1187 assert(VF.isVector() && "Expected VF >=2");
1188 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1189 }
1190
1191 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1192 /// interleaving group \p Grp and vector width \p VF.
1193 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1194 ElementCount VF, InstWidening W,
1195 InstructionCost Cost) {
1196 assert(VF.isVector() && "Expected VF >=2");
1197     // Broadcast this decision to all instructions inside the group.
1198     // But the cost will be assigned to one instruction only.
1199 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1200 if (auto *I = Grp->getMember(i)) {
1201 if (Grp->getInsertPos() == I)
1202 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1203 else
1204 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1205 }
1206 }
1207 }
1208
1209 /// Return the cost model decision for the given instruction \p I and vector
1210 /// width \p VF. Return CM_Unknown if this instruction did not pass
1211 /// through the cost modeling.
1212   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1213 assert(VF.isVector() && "Expected VF to be a vector VF");
1214 assert(
1215 TheLoop->isInnermost() &&
1216 "cost-model should not be used for outer loops (in VPlan-native path)");
1217
1218 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1219 auto Itr = WideningDecisions.find(InstOnVF);
1220 if (Itr == WideningDecisions.end())
1221 return CM_Unknown;
1222 return Itr->second.first;
1223 }
1224
1225 /// Return the vectorization cost for the given instruction \p I and vector
1226 /// width \p VF.
1227   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1228 assert(VF.isVector() && "Expected VF >=2");
1229 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1230 assert(WideningDecisions.contains(InstOnVF) &&
1231 "The cost is not calculated");
1232 return WideningDecisions[InstOnVF].second;
1233 }
1234
1235 struct CallWideningDecision {
1236 InstWidening Kind;
1237 Function *Variant;
1238 Intrinsic::ID IID;
1239 std::optional<unsigned> MaskPos;
1240 InstructionCost Cost;
1241 };
1242
1243   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1244 Function *Variant, Intrinsic::ID IID,
1245 std::optional<unsigned> MaskPos,
1246 InstructionCost Cost) {
1247 assert(!VF.isScalar() && "Expected vector VF");
1248 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1249 MaskPos, Cost};
1250 }
1251
1252   CallWideningDecision getCallWideningDecision(CallInst *CI,
1253 ElementCount VF) const {
1254 assert(!VF.isScalar() && "Expected vector VF");
1255 return CallWideningDecisions.at(std::make_pair(CI, VF));
1256 }
1257
1258 /// Return True if instruction \p I is an optimizable truncate whose operand
1259 /// is an induction variable. Such a truncate will be removed by adding a new
1260 /// induction variable with the destination type.
1261   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1262 // If the instruction is not a truncate, return false.
1263 auto *Trunc = dyn_cast<TruncInst>(I);
1264 if (!Trunc)
1265 return false;
1266
1267 // Get the source and destination types of the truncate.
1268 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1269 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1270
1271 // If the truncate is free for the given types, return false. Replacing a
1272 // free truncate with an induction variable would add an induction variable
1273 // update instruction to each iteration of the loop. We exclude from this
1274 // check the primary induction variable since it will need an update
1275 // instruction regardless.
1276 Value *Op = Trunc->getOperand(0);
1277 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1278 return false;
1279
1280 // If the truncated value is not an induction variable, return false.
1281 return Legal->isInductionPhi(Op);
1282 }
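  // Illustrative sketch (assumed example, not from the source): for IR such as
  //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  //   %t  = trunc i64 %iv to i32
  // the truncate is a candidate for this optimization, since a new i32
  // induction variable can replace %t, unless the i64 -> i32 truncate is
  // already free for the target and %iv is not the primary induction variable.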
1283
1284 /// Collects the instructions to scalarize for each predicated instruction in
1285 /// the loop.
1286 void collectInstsToScalarize(ElementCount VF);
1287
1288 /// Collect Uniform and Scalar values for the given \p VF.
1289 /// The sets depend on CM decision for Load/Store instructions
1290 /// that may be vectorized as interleave, gather-scatter or scalarized.
1291 /// Also make a decision on what to do about call instructions in the loop
1292 /// at that VF -- scalarize, call a known vector routine, or call a
1293 /// vector intrinsic.
1294   void collectUniformsAndScalars(ElementCount VF) {
1295 // Do the analysis once.
1296 if (VF.isScalar() || Uniforms.contains(VF))
1297 return;
1298 setCostBasedWideningDecision(VF);
1299 setVectorizedCallDecision(VF);
1300 collectLoopUniforms(VF);
1301 collectLoopScalars(VF);
1302 }
1303
1304 /// Returns true if the target machine supports masked store operation
1305 /// for the given \p DataType and kind of access to \p Ptr.
1306   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1307 return Legal->isConsecutivePtr(DataType, Ptr) &&
1308 TTI.isLegalMaskedStore(DataType, Alignment);
1309 }
1310
1311 /// Returns true if the target machine supports masked load operation
1312 /// for the given \p DataType and kind of access to \p Ptr.
1313   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1314 return Legal->isConsecutivePtr(DataType, Ptr) &&
1315 TTI.isLegalMaskedLoad(DataType, Alignment);
1316 }
1317
1318 /// Returns true if the target machine can represent \p V as a masked gather
1319 /// or scatter operation.
1320   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1321 bool LI = isa<LoadInst>(V);
1322 bool SI = isa<StoreInst>(V);
1323 if (!LI && !SI)
1324 return false;
1325 auto *Ty = getLoadStoreType(V);
1326 Align Align = getLoadStoreAlignment(V);
1327 if (VF.isVector())
1328 Ty = VectorType::get(Ty, VF);
1329 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1330 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1331 }
1332
1333 /// Returns true if the target machine supports all of the reduction
1334 /// variables found for the given VF.
1335   bool canVectorizeReductions(ElementCount VF) const {
1336 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1337 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1338 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1339 }));
1340 }
1341
1342 /// Given costs for both strategies, return true if the scalar predication
1343 /// lowering should be used for div/rem. This incorporates an override
1344 /// option so it is not simply a cost comparison.
1345   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1346 InstructionCost SafeDivisorCost) const {
1347 switch (ForceSafeDivisor) {
1348 case cl::BOU_UNSET:
1349 return ScalarCost < SafeDivisorCost;
1350 case cl::BOU_TRUE:
1351 return false;
1352 case cl::BOU_FALSE:
1353 return true;
1354 };
1355 llvm_unreachable("impossible case value");
1356 }
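  // Illustrative note (not authoritative): the "safe divisor" strategy
  // referred to above conceptually rewrites a predicated division such as
  //   %q = udiv i32 %x, %d        ; only executed when the mask lane is true
  // into
  //   %safe = select i1 %mask, i32 %d, i32 1
  //   %q    = udiv i32 %x, %safe
  // so the divide can run unconditionally, whereas the scalar-predication
  // strategy keeps the divide inside a predicated, scalarized block.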
1357
1358 /// Returns true if \p I is an instruction which requires predication and
1359 /// for which our chosen predication strategy is scalarization (i.e. we
1360 /// don't have an alternate strategy such as masking available).
1361 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1362 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1363
1364 /// Returns true if \p I is an instruction that needs to be predicated
1365 /// at runtime. The result is independent of the predication mechanism.
1366 /// Superset of instructions that return true for isScalarWithPredication.
1367 bool isPredicatedInst(Instruction *I) const;
1368
1369 /// Return the costs for our two available strategies for lowering a
1370 /// div/rem operation which requires speculating at least one lane.
1371 /// First result is for scalarization (will be invalid for scalable
1372 /// vectors); second is for the safe-divisor strategy.
1373 std::pair<InstructionCost, InstructionCost>
1374 getDivRemSpeculationCost(Instruction *I,
1375 ElementCount VF) const;
1376
1377 /// Returns true if \p I is a memory instruction with consecutive memory
1378 /// access that can be widened.
1379 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1380
1381 /// Returns true if \p I is a memory instruction in an interleaved-group
1382 /// of memory accesses that can be vectorized with wide vector loads/stores
1383 /// and shuffles.
1384 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1385
1386 /// Check if \p Instr belongs to any interleaved access group.
1387   bool isAccessInterleaved(Instruction *Instr) const {
1388 return InterleaveInfo.isInterleaved(Instr);
1389 }
1390
1391 /// Get the interleaved access group that \p Instr belongs to.
1392 const InterleaveGroup<Instruction> *
1393   getInterleavedAccessGroup(Instruction *Instr) const {
1394 return InterleaveInfo.getInterleaveGroup(Instr);
1395 }
1396
1397 /// Returns true if we're required to use a scalar epilogue for at least
1398 /// the final iteration of the original loop.
1399   bool requiresScalarEpilogue(bool IsVectorizing) const {
1400 if (!isScalarEpilogueAllowed()) {
1401 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1402 return false;
1403 }
1404     // If we might exit from anywhere but the latch, we must run the exiting
1405     // iteration in scalar form.
1406 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1407 LLVM_DEBUG(
1408 dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1409 return true;
1410 }
1411 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1412 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1413 "interleaved group requires scalar epilogue\n");
1414 return true;
1415 }
1416 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1417 return false;
1418 }
1419
1420 /// Returns true if we're required to use a scalar epilogue for at least
1421 /// the final iteration of the original loop for all VFs in \p Range.
1422 /// A scalar epilogue must either be required for all VFs in \p Range or for
1423 /// none.
1424   bool requiresScalarEpilogue(VFRange Range) const {
1425 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1426 return requiresScalarEpilogue(VF.isVector());
1427 };
1428 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1429 assert(
1430 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1431 "all VFs in range must agree on whether a scalar epilogue is required");
1432 return IsRequired;
1433 }
1434
1435 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1436 /// loop hint annotation.
1437   bool isScalarEpilogueAllowed() const {
1438 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1439 }
1440
1441 /// Returns the TailFoldingStyle that is best for the current loop.
1442   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1443 if (!ChosenTailFoldingStyle)
1444 return TailFoldingStyle::None;
1445 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1446 : ChosenTailFoldingStyle->second;
1447 }
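  // Illustrative note (example added for clarity): with tail folding by
  // masking, a loop with trip count 10 and VF = 4 executes 3 masked vector
  // iterations instead of 2 vector iterations plus a 2-iteration scalar
  // remainder; the final iteration runs under a lane mask like <1,1,0,0> so
  // the out-of-range lanes have no visible effect.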
1448
1449   /// Selects and saves the TailFoldingStyle for two cases: whether the IV
1450   /// update may overflow or not.
1451   /// \param IsScalableVF true if scalable vector factors are enabled.
1452 /// \param UserIC User specific interleave count.
1453   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1454 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1455 if (!Legal->canFoldTailByMasking()) {
1456 ChosenTailFoldingStyle =
1457 std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1458 return;
1459 }
1460
1461 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1462 ChosenTailFoldingStyle = std::make_pair(
1463 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1464 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1465 return;
1466 }
1467
1468 // Set styles when forced.
1469 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1470 ForceTailFoldingStyle.getValue());
1471 if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1472 return;
1473 // Override forced styles if needed.
1474 // FIXME: use actual opcode/data type for analysis here.
1475 // FIXME: Investigate opportunity for fixed vector factor.
1476 bool EVLIsLegal =
1477 IsScalableVF && UserIC <= 1 &&
1478 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1479 !EnableVPlanNativePath &&
1480 // FIXME: implement support for max safe dependency distance.
1481 Legal->isSafeForAnyVectorWidth();
1482 if (!EVLIsLegal) {
1483 // If for some reason EVL mode is unsupported, fallback to
1484 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1485 // in a generic way.
1486 ChosenTailFoldingStyle =
1487 std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1488 TailFoldingStyle::DataWithoutLaneMask);
1489 LLVM_DEBUG(
1490 dbgs()
1491 << "LV: Preference for VP intrinsics indicated. Will "
1492 "not try to generate VP Intrinsics "
1493 << (UserIC > 1
1494 ? "since interleave count specified is greater than 1.\n"
1495 : "due to non-interleaving reasons.\n"));
1496 }
1497 }
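  // Illustrative note (assumption, not authoritative): DataWithEVL lowers the
  // folded tail with VP (vector-predication) intrinsics that take an explicit
  // vector length derived from the remaining trip count, while
  // DataWithoutLaneMask instead forms the predicate by comparing the widened
  // induction variable against the trip count on every iteration.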
1498
1499 /// Returns true if all loop blocks should be masked to fold tail loop.
1500   bool foldTailByMasking() const {
1501 // TODO: check if it is possible to check for None style independent of
1502 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1503 return getTailFoldingStyle() != TailFoldingStyle::None;
1504 }
1505
1506   /// Returns true if the instructions in this block require predication
1507 /// for any reason, e.g. because tail folding now requires a predicate
1508 /// or because the block in the original loop was predicated.
1509   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1510 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1511 }
1512
1513 /// Returns true if VP intrinsics with explicit vector length support should
1514 /// be generated in the tail folded loop.
1515   bool foldTailWithEVL() const {
1516 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1517 }
1518
1519 /// Returns true if the Phi is part of an inloop reduction.
1520   bool isInLoopReduction(PHINode *Phi) const {
1521 return InLoopReductions.contains(Phi);
1522 }
1523
1524 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1525 /// with factor VF. Return the cost of the instruction, including
1526 /// scalarization overhead if it's needed.
1527 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1528
1529 /// Estimate cost of a call instruction CI if it were vectorized with factor
1530 /// VF. Return the cost of the instruction, including scalarization overhead
1531 /// if it's needed.
1532 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1533
1534 /// Invalidates decisions already taken by the cost model.
1535   void invalidateCostModelingDecisions() {
1536 WideningDecisions.clear();
1537 CallWideningDecisions.clear();
1538 Uniforms.clear();
1539 Scalars.clear();
1540 }
1541
1542 /// Returns the expected execution cost. The unit of the cost does
1543 /// not matter because we use the 'cost' units to compare different
1544 /// vector widths. The cost that is returned is *not* normalized by
1545 /// the factor width. If \p Invalid is not nullptr, this function
1546 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1547 /// each instruction that has an Invalid cost for the given VF.
1548 InstructionCost
1549 expectedCost(ElementCount VF,
1550 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1551
1552   bool hasPredStores() const { return NumPredStores > 0; }
1553
1554 /// Returns true if epilogue vectorization is considered profitable, and
1555 /// false otherwise.
1556 /// \p VF is the vectorization factor chosen for the original loop.
1557 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1558
1559 /// Returns the execution time cost of an instruction for a given vector
1560 /// width. Vector width of one means scalar.
1561 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1562
1563 /// Return the cost of instructions in an inloop reduction pattern, if I is
1564 /// part of that pattern.
1565 std::optional<InstructionCost>
1566 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1567 TTI::TargetCostKind CostKind) const;
1568
1569 private:
1570 unsigned NumPredStores = 0;
1571
1572 /// \return An upper bound for the vectorization factors for both
1573 /// fixed and scalable vectorization, where the minimum-known number of
1574 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1575 /// disabled or unsupported, then the scalable part will be equal to
1576 /// ElementCount::getScalable(0).
1577 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1578 ElementCount UserVF,
1579 bool FoldTailByMasking);
1580
1581   /// \return the maximized element count based on the target's vector
1582 /// registers and the loop trip-count, but limited to a maximum safe VF.
1583 /// This is a helper function of computeFeasibleMaxVF.
1584 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1585 unsigned SmallestType,
1586 unsigned WidestType,
1587 ElementCount MaxSafeVF,
1588 bool FoldTailByMasking);
1589
1590 /// Checks if scalable vectorization is supported and enabled. Caches the
1591 /// result to avoid repeated debug dumps for repeated queries.
1592 bool isScalableVectorizationAllowed();
1593
1594 /// \return the maximum legal scalable VF, based on the safe max number
1595 /// of elements.
1596 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1597
1598 /// Calculate vectorization cost of memory instruction \p I.
1599 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1600
1601 /// The cost computation for scalarized memory instruction.
1602 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1603
1604 /// The cost computation for interleaving group of memory instructions.
1605 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1606
1607 /// The cost computation for Gather/Scatter instruction.
1608 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1609
1610 /// The cost computation for widening instruction \p I with consecutive
1611 /// memory access.
1612 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1613
1614 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1615 /// Load: scalar load + broadcast.
1616 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1617 /// element)
1618 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1619
1620 /// Estimate the overhead of scalarizing an instruction. This is a
1621 /// convenience wrapper for the type-based getScalarizationOverhead API.
1622 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1623 TTI::TargetCostKind CostKind) const;
1624
1625 /// Returns true if an artificially high cost for emulated masked memrefs
1626 /// should be used.
1627 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1628
1629 /// Map of scalar integer values to the smallest bitwidth they can be legally
1630 /// represented as. The vector equivalents of these values should be truncated
1631 /// to this type.
1632 MapVector<Instruction *, uint64_t> MinBWs;
1633
1634 /// A type representing the costs for instructions if they were to be
1635 /// scalarized rather than vectorized. The entries are Instruction-Cost
1636 /// pairs.
1637 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1638
1639   /// A set containing all BasicBlocks that are known to be present after
1640   /// vectorization as predicated blocks.
1641 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1642 PredicatedBBsAfterVectorization;
1643
1644 /// Records whether it is allowed to have the original scalar loop execute at
1645 /// least once. This may be needed as a fallback loop in case runtime
1646 /// aliasing/dependence checks fail, or to handle the tail/remainder
1647 /// iterations when the trip count is unknown or doesn't divide by the VF,
1648 /// or as a peel-loop to handle gaps in interleave-groups.
1649 /// Under optsize and when the trip count is very small we don't allow any
1650 /// iterations to execute in the scalar loop.
1651 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1652
1653   /// The tail folding style finally chosen. The first element is used if the
1654   /// IV update may overflow, the second element if it does not.
1655 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1656 ChosenTailFoldingStyle;
1657
1658 /// true if scalable vectorization is supported and enabled.
1659 std::optional<bool> IsScalableVectorizationAllowed;
1660
1661 /// A map holding scalar costs for different vectorization factors. The
1662 /// presence of a cost for an instruction in the mapping indicates that the
1663 /// instruction will be scalarized when vectorizing with the associated
1664 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1665 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1666
1667 /// Holds the instructions known to be uniform after vectorization.
1668 /// The data is collected per VF.
1669 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1670
1671 /// Holds the instructions known to be scalar after vectorization.
1672 /// The data is collected per VF.
1673 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1674
1675 /// Holds the instructions (address computations) that are forced to be
1676 /// scalarized.
1677 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1678
1679 /// PHINodes of the reductions that should be expanded in-loop.
1680 SmallPtrSet<PHINode *, 4> InLoopReductions;
1681
1682 /// A Map of inloop reduction operations and their immediate chain operand.
1683 /// FIXME: This can be removed once reductions can be costed correctly in
1684 /// VPlan. This was added to allow quick lookup of the inloop operations.
1685 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1686
1687 /// Returns the expected difference in cost from scalarizing the expression
1688 /// feeding a predicated instruction \p PredInst. The instructions to
1689 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1690 /// non-negative return value implies the expression will be scalarized.
1691 /// Currently, only single-use chains are considered for scalarization.
1692 InstructionCost computePredInstDiscount(Instruction *PredInst,
1693 ScalarCostsTy &ScalarCosts,
1694 ElementCount VF);
1695
1696 /// Collect the instructions that are uniform after vectorization. An
1697 /// instruction is uniform if we represent it with a single scalar value in
1698 /// the vectorized loop corresponding to each vector iteration. Examples of
1699 /// uniform instructions include pointer operands of consecutive or
1700 /// interleaved memory accesses. Note that although uniformity implies an
1701 /// instruction will be scalar, the reverse is not true. In general, a
1702 /// scalarized instruction will be represented by VF scalar values in the
1703 /// vectorized loop, each corresponding to an iteration of the original
1704 /// scalar loop.
1705 void collectLoopUniforms(ElementCount VF);
1706
1707 /// Collect the instructions that are scalar after vectorization. An
1708 /// instruction is scalar if it is known to be uniform or will be scalarized
1709 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1710 /// to the list if they are used by a load/store instruction that is marked as
1711 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1712 /// VF values in the vectorized loop, each corresponding to an iteration of
1713 /// the original scalar loop.
1714 void collectLoopScalars(ElementCount VF);
1715
1716 /// Keeps cost model vectorization decision and cost for instructions.
1717 /// Right now it is used for memory instructions only.
1718 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1719 std::pair<InstWidening, InstructionCost>>;
1720
1721 DecisionList WideningDecisions;
1722
1723 using CallDecisionList =
1724 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1725
1726 CallDecisionList CallWideningDecisions;
1727
1728 /// Returns true if \p V is expected to be vectorized and it needs to be
1729 /// extracted.
1730   bool needsExtract(Value *V, ElementCount VF) const {
1731 Instruction *I = dyn_cast<Instruction>(V);
1732 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1733 TheLoop->isLoopInvariant(I))
1734 return false;
1735
1736 // Assume we can vectorize V (and hence we need extraction) if the
1737 // scalars are not computed yet. This can happen, because it is called
1738 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1739 // the scalars are collected. That should be a safe assumption in most
1740 // cases, because we check if the operands have vectorizable types
1741 // beforehand in LoopVectorizationLegality.
1742 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1743 };
1744
1745 /// Returns a range containing only operands needing to be extracted.
1746   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1747 ElementCount VF) const {
1748 return SmallVector<Value *, 4>(make_filter_range(
1749 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1750 }
1751
1752 public:
1753 /// The loop that we evaluate.
1754 Loop *TheLoop;
1755
1756 /// Predicated scalar evolution analysis.
1757 PredicatedScalarEvolution &PSE;
1758
1759 /// Loop Info analysis.
1760 LoopInfo *LI;
1761
1762 /// Vectorization legality.
1763 LoopVectorizationLegality *Legal;
1764
1765 /// Vector target information.
1766 const TargetTransformInfo &TTI;
1767
1768 /// Target Library Info.
1769 const TargetLibraryInfo *TLI;
1770
1771 /// Demanded bits analysis.
1772 DemandedBits *DB;
1773
1774 /// Assumption cache.
1775 AssumptionCache *AC;
1776
1777 /// Interface to emit optimization remarks.
1778 OptimizationRemarkEmitter *ORE;
1779
1780 const Function *TheFunction;
1781
1782 /// Loop Vectorize Hint.
1783 const LoopVectorizeHints *Hints;
1784
1785 /// The interleave access information contains groups of interleaved accesses
1786 /// with the same stride and close to each other.
1787 InterleavedAccessInfo &InterleaveInfo;
1788
1789 /// Values to ignore in the cost model.
1790 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1791
1792 /// Values to ignore in the cost model when VF > 1.
1793 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1794
1795 /// All element types found in the loop.
1796 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1797 };
1798 } // end namespace llvm
1799
1800 namespace {
1801 /// Helper struct to manage generating runtime checks for vectorization.
1802 ///
1803 /// The runtime checks are created up-front in temporary blocks to allow better
1804 /// estimating the cost, and are un-linked from the existing IR. After deciding
1805 /// to
1805 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1806 /// temporary blocks are completely removed.
1807 class GeneratedRTChecks {
1808 /// Basic block which contains the generated SCEV checks, if any.
1809 BasicBlock *SCEVCheckBlock = nullptr;
1810
1811 /// The value representing the result of the generated SCEV checks. If it is
1812 /// nullptr, either no SCEV checks have been generated or they have been used.
1813 Value *SCEVCheckCond = nullptr;
1814
1815 /// Basic block which contains the generated memory runtime checks, if any.
1816 BasicBlock *MemCheckBlock = nullptr;
1817
1818 /// The value representing the result of the generated memory runtime checks.
1819 /// If it is nullptr, either no memory runtime checks have been generated or
1820 /// they have been used.
1821 Value *MemRuntimeCheckCond = nullptr;
1822
1823 DominatorTree *DT;
1824 LoopInfo *LI;
1825 TargetTransformInfo *TTI;
1826
1827 SCEVExpander SCEVExp;
1828 SCEVExpander MemCheckExp;
1829
1830 bool CostTooHigh = false;
1831 const bool AddBranchWeights;
1832
1833 Loop *OuterLoop = nullptr;
1834
1835 public:
1836   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1837 TargetTransformInfo *TTI, const DataLayout &DL,
1838 bool AddBranchWeights)
1839 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1840 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1841
1842 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1843 /// accurately estimate the cost of the runtime checks. The blocks are
1844   /// un-linked from the IR and are added back during vector code generation. If
1845 /// there is no vector code generation, the check blocks are removed
1846 /// completely.
1847   void Create(Loop *L, const LoopAccessInfo &LAI,
1848 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1849
1850 // Hard cutoff to limit compile-time increase in case a very large number of
1851     // runtime checks need to be generated.
1852 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1853 // profile info.
1854 CostTooHigh =
1855 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1856 if (CostTooHigh)
1857 return;
1858
1859 BasicBlock *LoopHeader = L->getHeader();
1860 BasicBlock *Preheader = L->getLoopPreheader();
1861
1862 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1863 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1864 // may be used by SCEVExpander. The blocks will be un-linked from their
1865 // predecessors and removed from LI & DT at the end of the function.
1866 if (!UnionPred.isAlwaysTrue()) {
1867 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1868 nullptr, "vector.scevcheck");
1869
1870 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1871 &UnionPred, SCEVCheckBlock->getTerminator());
1872 }
1873
1874 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1875 if (RtPtrChecking.Need) {
1876 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1877 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1878 "vector.memcheck");
1879
1880 auto DiffChecks = RtPtrChecking.getDiffChecks();
1881 if (DiffChecks) {
1882 Value *RuntimeVF = nullptr;
1883 MemRuntimeCheckCond = addDiffRuntimeChecks(
1884 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1885 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1886 if (!RuntimeVF)
1887 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1888 return RuntimeVF;
1889 },
1890 IC);
1891 } else {
1892 MemRuntimeCheckCond = addRuntimeChecks(
1893 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1894 MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1895 }
1896 assert(MemRuntimeCheckCond &&
1897 "no RT checks generated although RtPtrChecking "
1898 "claimed checks are required");
1899 }
1900
1901 if (!MemCheckBlock && !SCEVCheckBlock)
1902 return;
1903
1904 // Unhook the temporary block with the checks, update various places
1905 // accordingly.
1906 if (SCEVCheckBlock)
1907 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1908 if (MemCheckBlock)
1909 MemCheckBlock->replaceAllUsesWith(Preheader);
1910
1911 if (SCEVCheckBlock) {
1912 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1913 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1914 Preheader->getTerminator()->eraseFromParent();
1915 }
1916 if (MemCheckBlock) {
1917 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1918 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1919 Preheader->getTerminator()->eraseFromParent();
1920 }
1921
1922 DT->changeImmediateDominator(LoopHeader, Preheader);
1923 if (MemCheckBlock) {
1924 DT->eraseNode(MemCheckBlock);
1925 LI->removeBlock(MemCheckBlock);
1926 }
1927 if (SCEVCheckBlock) {
1928 DT->eraseNode(SCEVCheckBlock);
1929 LI->removeBlock(SCEVCheckBlock);
1930 }
1931
1932 // Outer loop is used as part of the later cost calculations.
1933 OuterLoop = L->getParentLoop();
1934 }
1935
1936   InstructionCost getCost() {
1937 if (SCEVCheckBlock || MemCheckBlock)
1938 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1939
1940 if (CostTooHigh) {
1941 InstructionCost Cost;
1942 Cost.setInvalid();
1943 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1944 return Cost;
1945 }
1946
1947 InstructionCost RTCheckCost = 0;
1948 if (SCEVCheckBlock)
1949 for (Instruction &I : *SCEVCheckBlock) {
1950 if (SCEVCheckBlock->getTerminator() == &I)
1951 continue;
1952 InstructionCost C =
1953 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1954 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1955 RTCheckCost += C;
1956 }
1957 if (MemCheckBlock) {
1958 InstructionCost MemCheckCost = 0;
1959 for (Instruction &I : *MemCheckBlock) {
1960 if (MemCheckBlock->getTerminator() == &I)
1961 continue;
1962 InstructionCost C =
1963 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1964 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1965 MemCheckCost += C;
1966 }
1967
1968 // If the runtime memory checks are being created inside an outer loop
1969 // we should find out if these checks are outer loop invariant. If so,
1970       // the checks will likely be hoisted out and so the effective cost will
1971       // be reduced according to the outer loop trip count.
1972 if (OuterLoop) {
1973 ScalarEvolution *SE = MemCheckExp.getSE();
1974 // TODO: If profitable, we could refine this further by analysing every
1975 // individual memory check, since there could be a mixture of loop
1976 // variant and invariant checks that mean the final condition is
1977 // variant.
1978 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1979 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1980 // It seems reasonable to assume that we can reduce the effective
1981 // cost of the checks even when we know nothing about the trip
1982 // count. Assume that the outer loop executes at least twice.
1983 unsigned BestTripCount = 2;
1984
1985 // If exact trip count is known use that.
1986 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
1987 BestTripCount = SmallTC;
1988 else if (LoopVectorizeWithBlockFrequency) {
1989 // Else use profile data if available.
1990 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
1991 BestTripCount = *EstimatedTC;
1992 }
1993
1994 BestTripCount = std::max(BestTripCount, 1U);
1995 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1996
1997 // Let's ensure the cost is always at least 1.
1998 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
1999 (InstructionCost::CostType)1);
2000
2001 if (BestTripCount > 1)
2002 LLVM_DEBUG(dbgs()
2003 << "We expect runtime memory checks to be hoisted "
2004 << "out of the outer loop. Cost reduced from "
2005 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2006
2007 MemCheckCost = NewMemCheckCost;
2008 }
2009 }
2010
2011 RTCheckCost += MemCheckCost;
2012 }
2013
2014 if (SCEVCheckBlock || MemCheckBlock)
2015 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2016 << "\n");
2017
2018 return RTCheckCost;
2019 }
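  // Worked example (illustrative only): if the generated memory checks cost 24
  // units and the enclosing outer loop has an estimated trip count of 8, the
  // hoisted checks are charged 24 / 8 = 3 units above, clamped to at least 1.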
2020
2021 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2022 /// unused.
2023   ~GeneratedRTChecks() {
2024 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2025 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2026 if (!SCEVCheckCond)
2027 SCEVCleaner.markResultUsed();
2028
2029 if (!MemRuntimeCheckCond)
2030 MemCheckCleaner.markResultUsed();
2031
2032 if (MemRuntimeCheckCond) {
2033 auto &SE = *MemCheckExp.getSE();
2034 // Memory runtime check generation creates compares that use expanded
2035 // values. Remove them before running the SCEVExpanderCleaners.
2036 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2037 if (MemCheckExp.isInsertedInstruction(&I))
2038 continue;
2039 SE.forgetValue(&I);
2040 I.eraseFromParent();
2041 }
2042 }
2043 MemCheckCleaner.cleanup();
2044 SCEVCleaner.cleanup();
2045
2046 if (SCEVCheckCond)
2047 SCEVCheckBlock->eraseFromParent();
2048 if (MemRuntimeCheckCond)
2049 MemCheckBlock->eraseFromParent();
2050 }
2051
2052 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2053 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2054 /// depending on the generated condition.
2055   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2056 BasicBlock *LoopVectorPreHeader,
2057 BasicBlock *LoopExitBlock) {
2058 if (!SCEVCheckCond)
2059 return nullptr;
2060
2061 Value *Cond = SCEVCheckCond;
2062 // Mark the check as used, to prevent it from being removed during cleanup.
2063 SCEVCheckCond = nullptr;
2064 if (auto *C = dyn_cast<ConstantInt>(Cond))
2065 if (C->isZero())
2066 return nullptr;
2067
2068 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2069
2070 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2071 // Create new preheader for vector loop.
2072 if (OuterLoop)
2073 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2074
2075 SCEVCheckBlock->getTerminator()->eraseFromParent();
2076 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2077 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2078 SCEVCheckBlock);
2079
2080 DT->addNewBlock(SCEVCheckBlock, Pred);
2081 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2082
2083 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2084 if (AddBranchWeights)
2085 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2086 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2087 return SCEVCheckBlock;
2088 }
2089
2090 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2091 /// the branches to branch to the vector preheader or \p Bypass, depending on
2092 /// the generated condition.
2093   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2094 BasicBlock *LoopVectorPreHeader) {
2095 // Check if we generated code that checks in runtime if arrays overlap.
2096 if (!MemRuntimeCheckCond)
2097 return nullptr;
2098
2099 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2100 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2101 MemCheckBlock);
2102
2103 DT->addNewBlock(MemCheckBlock, Pred);
2104 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2105 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2106
2107 if (OuterLoop)
2108 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2109
2110 BranchInst &BI =
2111 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2112 if (AddBranchWeights) {
2113 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2114 }
2115 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2116 MemCheckBlock->getTerminator()->setDebugLoc(
2117 Pred->getTerminator()->getDebugLoc());
2118
2119 // Mark the check as used, to prevent it from being removed during cleanup.
2120 MemRuntimeCheckCond = nullptr;
2121 return MemCheckBlock;
2122 }
2123 };
2124 } // namespace
2125
2126 static bool useActiveLaneMask(TailFoldingStyle Style) {
2127 return Style == TailFoldingStyle::Data ||
2128 Style == TailFoldingStyle::DataAndControlFlow ||
2129 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2130 }
2131
2132 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2133 return Style == TailFoldingStyle::DataAndControlFlow ||
2134 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2135 }
2136
2137 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2138 // vectorization. The loop needs to be annotated with #pragma omp simd
2139 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2140 // vector length information is not provided, vectorization is not considered
2141 // explicit. Interleave hints are not allowed either. These limitations will be
2142 // relaxed in the future.
2143 // Please note that we are currently forced to abuse the pragma 'clang
2144 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2145 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2146 // provides *explicit vectorization hints* (LV can bypass legal checks and
2147 // assume that vectorization is legal). However, both hints are implemented
2148 // using the same metadata (llvm.loop.vectorize, processed by
2149 // LoopVectorizeHints). This will be fixed in the future when the native IR
2150 // representation for pragma 'omp simd' is introduced.
2151 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2152 OptimizationRemarkEmitter *ORE) {
2153 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2154 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2155
2156 // Only outer loops with an explicit vectorization hint are supported.
2157 // Unannotated outer loops are ignored.
2158 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2159 return false;
2160
2161 Function *Fn = OuterLp->getHeader()->getParent();
2162 if (!Hints.allowVectorization(Fn, OuterLp,
2163 true /*VectorizeOnlyWhenForced*/)) {
2164 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2165 return false;
2166 }
2167
2168 if (Hints.getInterleave() > 1) {
2169 // TODO: Interleave support is future work.
2170 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2171 "outer loops.\n");
2172 Hints.emitRemarkWithHints();
2173 return false;
2174 }
2175
2176 return true;
2177 }
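// Illustrative example (not from the source): an outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < n; ++i)
//     for (j = 0; j < m; ++j)
//       a[i][j] += b[i][j];
// would be accepted by isExplicitVecOuterLoop, while the same loop without an
// explicit vector width, or with an interleave hint, would be rejected.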
2178
2179 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2180 OptimizationRemarkEmitter *ORE,
2181 SmallVectorImpl<Loop *> &V) {
2182 // Collect inner loops and outer loops without irreducible control flow. For
2183 // now, only collect outer loops that have explicit vectorization hints. If we
2184 // are stress testing the VPlan H-CFG construction, we collect the outermost
2185 // loop of every loop nest.
2186 if (L.isInnermost() || VPlanBuildStressTest ||
2187 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2188 LoopBlocksRPO RPOT(&L);
2189 RPOT.perform(LI);
2190 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2191 V.push_back(&L);
2192 // TODO: Collect inner loops inside marked outer loops in case
2193 // vectorization fails for the outer loop. Do not invoke
2194 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2195 // already known to be reducible. We can use an inherited attribute for
2196 // that.
2197 return;
2198 }
2199 }
2200 for (Loop *InnerL : L)
2201 collectSupportedLoops(*InnerL, LI, ORE, V);
2202 }
2203
2204 //===----------------------------------------------------------------------===//
2205 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
2206 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2207 //===----------------------------------------------------------------------===//
2208
2209 /// Compute the transformed value of Index at offset StartValue using step
2210 /// StepValue.
2211 /// For integer induction, returns StartValue + Index * StepValue.
2212 /// For pointer induction, returns StartValue[Index * StepValue].
2213 /// FIXME: The newly created binary instructions should contain nsw/nuw
2214 /// flags, which can be found from the original scalar operations.
2215 static Value *
2216 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2217 Value *Step,
2218 InductionDescriptor::InductionKind InductionKind,
2219 const BinaryOperator *InductionBinOp) {
2220 Type *StepTy = Step->getType();
2221 Value *CastedIndex = StepTy->isIntegerTy()
2222 ? B.CreateSExtOrTrunc(Index, StepTy)
2223 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2224 if (CastedIndex != Index) {
2225 CastedIndex->setName(CastedIndex->getName() + ".cast");
2226 Index = CastedIndex;
2227 }
2228
2229 // Note: the IR at this point is broken. We cannot use SE to create any new
2230 // SCEV and then expand it, hoping that SCEV's simplification will give us
2231   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2232   // lead to various SCEV crashes. So all we can do is use the builder and rely
2233 // on InstCombine for future simplifications. Here we handle some trivial
2234 // cases only.
2235 auto CreateAdd = [&B](Value *X, Value *Y) {
2236 assert(X->getType() == Y->getType() && "Types don't match!");
2237 if (auto *CX = dyn_cast<ConstantInt>(X))
2238 if (CX->isZero())
2239 return Y;
2240 if (auto *CY = dyn_cast<ConstantInt>(Y))
2241 if (CY->isZero())
2242 return X;
2243 return B.CreateAdd(X, Y);
2244 };
2245
2246 // We allow X to be a vector type, in which case Y will potentially be
2247 // splatted into a vector with the same element count.
2248 auto CreateMul = [&B](Value *X, Value *Y) {
2249 assert(X->getType()->getScalarType() == Y->getType() &&
2250 "Types don't match!");
2251 if (auto *CX = dyn_cast<ConstantInt>(X))
2252 if (CX->isOne())
2253 return Y;
2254 if (auto *CY = dyn_cast<ConstantInt>(Y))
2255 if (CY->isOne())
2256 return X;
2257 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2258 if (XVTy && !isa<VectorType>(Y->getType()))
2259 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2260 return B.CreateMul(X, Y);
2261 };
2262
2263 switch (InductionKind) {
2264 case InductionDescriptor::IK_IntInduction: {
2265 assert(!isa<VectorType>(Index->getType()) &&
2266 "Vector indices not supported for integer inductions yet");
2267 assert(Index->getType() == StartValue->getType() &&
2268 "Index type does not match StartValue type");
2269 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2270 return B.CreateSub(StartValue, Index);
2271 auto *Offset = CreateMul(Index, Step);
2272 return CreateAdd(StartValue, Offset);
2273 }
2274 case InductionDescriptor::IK_PtrInduction:
2275 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2276 case InductionDescriptor::IK_FpInduction: {
2277 assert(!isa<VectorType>(Index->getType()) &&
2278 "Vector indices not supported for FP inductions yet");
2279 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2280 assert(InductionBinOp &&
2281 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2282 InductionBinOp->getOpcode() == Instruction::FSub) &&
2283 "Original bin op should be defined for FP induction");
2284
2285 Value *MulExp = B.CreateFMul(Step, Index);
2286 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2287 "induction");
2288 }
2289 case InductionDescriptor::IK_NoInduction:
2290 return nullptr;
2291 }
2292 llvm_unreachable("invalid enum");
2293 }
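// Worked example (illustrative): for an integer induction with StartValue 100
// and Step 3, an Index of 5 is transformed to 100 + 5 * 3 = 115; for a pointer
// induction the same Index yields StartValue[5 * 3], i.e. StartValue[15].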
2294
2295 std::optional<unsigned> getMaxVScale(const Function &F,
2296 const TargetTransformInfo &TTI) {
2297 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2298 return MaxVScale;
2299
2300 if (F.hasFnAttribute(Attribute::VScaleRange))
2301 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2302
2303 return std::nullopt;
2304 }
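// Illustrative note (assumption): for a function carrying the IR attribute
// vscale_range(1,16), and a target that does not report its own maximum via
// TTI, this returns 16.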
2305
2306 /// For the given VF and UF and maximum trip count computed for the loop, return
2307 /// whether the induction variable might overflow in the vectorized loop. If not,
2308 /// then we know a runtime overflow check always evaluates to false and can be
2309 /// removed.
2310 static bool isIndvarOverflowCheckKnownFalse(
2311 const LoopVectorizationCostModel *Cost,
2312 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2313 // Always be conservative if we don't know the exact unroll factor.
2314 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2315
2316 Type *IdxTy = Cost->Legal->getWidestInductionType();
2317 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2318
2319   // The runtime overflow check is known to be false iff the (max) trip-count
2320 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2321 // the vector loop induction variable.
2322 if (unsigned TC =
2323 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2324 uint64_t MaxVF = VF.getKnownMinValue();
2325 if (VF.isScalable()) {
2326 std::optional<unsigned> MaxVScale =
2327 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2328 if (!MaxVScale)
2329 return false;
2330 MaxVF *= *MaxVScale;
2331 }
2332
2333 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2334 }
2335
2336 return false;
2337 }
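// Worked example (illustrative): with an i8 widest induction type, a known
// maximum trip count of 250 and VF * UF = 8, the test (255 - 250) ugt 8 fails,
// so the overflow check must stay; with a maximum trip count of 200 the test
// (255 - 200) ugt 8 holds and the runtime check is known to be false.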
2338
2339 // Return whether we allow using masked interleave-groups (for dealing with
2340 // strided loads/stores that reside in predicated blocks, or for dealing
2341 // with gaps).
2342 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2343 // If an override option has been passed in for interleaved accesses, use it.
2344 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2345 return EnableMaskedInterleavedMemAccesses;
2346
2347 return TTI.enableMaskedInterleavedAccessVectorization();
2348 }
2349
2350 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2351 VPReplicateRecipe *RepRecipe,
2352 const VPIteration &Instance,
2353 VPTransformState &State) {
2354 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2355
2356 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2357 // the first lane and part.
2358 if (isa<NoAliasScopeDeclInst>(Instr))
2359 if (!Instance.isFirstIteration())
2360 return;
2361
2362   // Does this instruction return a value?
2363 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2364
2365 Instruction *Cloned = Instr->clone();
2366 if (!IsVoidRetTy) {
2367 Cloned->setName(Instr->getName() + ".cloned");
2368 #if !defined(NDEBUG)
2369 // Verify that VPlan type inference results agree with the type of the
2370 // generated values.
2371 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2372 "inferred type and type from generated instructions do not match");
2373 #endif
2374 }
2375
2376 RepRecipe->setFlags(Cloned);
2377
2378 if (auto DL = Instr->getDebugLoc())
2379 State.setDebugLocFrom(DL);
2380
2381 // Replace the operands of the cloned instructions with their scalar
2382 // equivalents in the new loop.
2383 for (const auto &I : enumerate(RepRecipe->operands())) {
2384 auto InputInstance = Instance;
2385 VPValue *Operand = I.value();
2386 if (vputils::isUniformAfterVectorization(Operand))
2387 InputInstance.Lane = VPLane::getFirstLane();
2388 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2389 }
2390 State.addNewMetadata(Cloned, Instr);
2391
2392 // Place the cloned scalar in the new loop.
2393 State.Builder.Insert(Cloned);
2394
2395 State.set(RepRecipe, Cloned, Instance);
2396
2397   // If we just cloned a new assumption, add it to the assumption cache.
2398 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2399 AC->registerAssumption(II);
2400
2401 // End if-block.
2402 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2403 if (IfPredicateInstr)
2404 PredicatedInstructions.push_back(Cloned);
2405 }
2406
2407 Value *
2408 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2409 if (VectorTripCount)
2410 return VectorTripCount;
2411
2412 Value *TC = getTripCount();
2413 IRBuilder<> Builder(InsertBlock->getTerminator());
2414
2415 Type *Ty = TC->getType();
2416 // This is where we can make the step a runtime constant.
2417 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2418
2419 // If the tail is to be folded by masking, round the number of iterations N
2420 // up to a multiple of Step instead of rounding down. This is done by first
2421 // adding Step-1 and then rounding down. Note that it's ok if this addition
2422 // overflows: the vector induction variable will eventually wrap to zero given
2423 // that it starts at zero and its Step is a power of two; the loop will then
2424 // exit, with the last early-exit vector comparison also producing all-true.
2425 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2426 // is accounted for in emitIterationCountCheck that adds an overflow check.
2427 if (Cost->foldTailByMasking()) {
2428 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2429 "VF*UF must be a power of 2 when folding tail by masking");
2430 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2431 "n.rnd.up");
2432 }
2433
2434 // Now we need to generate the expression for the part of the loop that the
2435 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2436 // iterations are not required for correctness, or N - Step, otherwise. Step
2437 // is equal to the vectorization factor (number of SIMD elements) times the
2438 // unroll factor (number of SIMD instructions).
2439 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2440
2441 // There are cases where we *must* run at least one iteration in the remainder
2442 // loop. See the cost model for when this can happen. If the step evenly
2443 // divides the trip count, we set the remainder to be equal to the step. If
2444 // the step does not evenly divide the trip count, no adjustment is necessary
2445 // since there will already be scalar iterations. Note that the minimum
2446 // iterations check ensures that N >= Step.
2447 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2448 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2449 R = Builder.CreateSelect(IsZero, Step, R);
2450 }
2451
2452 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2453
2454 return VectorTripCount;
2455 }
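// Worked example (illustrative): with a trip count of 1003 and VF * UF = 8,
// n.mod.vf is 3 and n.vec is 1000, leaving 3 iterations for the scalar
// remainder. If a scalar epilogue is required and the trip count were exactly
// 1000, the remainder is bumped from 0 to 8 so that n.vec becomes 992 and at
// least one scalar iteration still runs.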
2456
2457 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2458 Value *Count = getTripCount();
2459 // Reuse existing vector loop preheader for TC checks.
2460   // Note that a new preheader block is generated for the vector loop.
2461 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2462 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2463
2464 // Generate code to check if the loop's trip count is less than VF * UF, or
2465 // equal to it in case a scalar epilogue is required; this implies that the
2466 // vector trip count is zero. This check also covers the case where adding one
2467 // to the backedge-taken count overflowed leading to an incorrect trip count
2468 // of zero. In this case we will also jump to the scalar loop.
2469 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2470 : ICmpInst::ICMP_ULT;
2471
2472 // If tail is to be folded, vector loop takes care of all iterations.
2473 Type *CountTy = Count->getType();
2474 Value *CheckMinIters = Builder.getFalse();
2475 auto CreateStep = [&]() -> Value * {
2476     // Create step with max(MinProfitableTripCount, UF * VF).
2477 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2478 return createStepForVF(Builder, CountTy, VF, UF);
2479
2480 Value *MinProfTC =
2481 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2482 if (!VF.isScalable())
2483 return MinProfTC;
2484 return Builder.CreateBinaryIntrinsic(
2485 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2486 };
2487
2488 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2489 if (Style == TailFoldingStyle::None)
2490 CheckMinIters =
2491 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2492 else if (VF.isScalable() &&
2493 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2494 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2495 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2496 // an overflow to zero when updating induction variables and so an
2497 // additional overflow check is required before entering the vector loop.
2498
2499 // Get the maximum unsigned value for the type.
2500 Value *MaxUIntTripCount =
2501 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2502 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2503
2504 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2505 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2506 }
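  // Illustrative sketch (hypothetical IR, i64 trip count): for VF = vscale x 4
  // and UF = 2 the guard built above is conceptually
  //   %lhs  = sub i64 -1, %count            ; UINT64_MAX - n
  //   %step = mul i64 (vscale * 4), 2
  //   %cmp  = icmp ult i64 %lhs, %step
  // and we branch to the scalar loop when %cmp is true, i.e. when the
  // induction variable could wrap before the vector loop exits.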
2507
2508 // Create new preheader for vector loop.
2509 LoopVectorPreHeader =
2510 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2511 "vector.ph");
2512
2513 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2514 DT->getNode(Bypass)->getIDom()) &&
2515 "TC check is expected to dominate Bypass");
2516
2517 // Update dominator for Bypass & LoopExit (if needed).
2518 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2519 BranchInst &BI =
2520 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2521 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2522 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2523 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2524 LoopBypassBlocks.push_back(TCCheckBlock);
2525 }
2526
2527 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2528 BasicBlock *const SCEVCheckBlock =
2529 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2530 if (!SCEVCheckBlock)
2531 return nullptr;
2532
2533 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2534 (OptForSizeBasedOnProfile &&
2535 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2536 "Cannot SCEV check stride or overflow when optimizing for size");
2537
2538
2539   // Update dominator only if this is the first RT check.
2540 if (LoopBypassBlocks.empty()) {
2541 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2542 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2543 // If there is an epilogue which must run, there's no edge from the
2544 // middle block to exit blocks and thus no need to update the immediate
2545 // dominator of the exit blocks.
2546 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2547 }
2548
2549 LoopBypassBlocks.push_back(SCEVCheckBlock);
2550 AddedSafetyChecks = true;
2551 return SCEVCheckBlock;
2552 }
2553
2554 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2555 // VPlan-native path does not do any analysis for runtime checks currently.
2556 if (EnableVPlanNativePath)
2557 return nullptr;
2558
2559 BasicBlock *const MemCheckBlock =
2560 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2561
2562   // Check if we generated code that checks at runtime whether arrays overlap.
2563   // We put the checks into a separate block to make the more common case of
2564   // few elements faster.
2565 if (!MemCheckBlock)
2566 return nullptr;
2567
2568 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2569 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2570 "Cannot emit memory checks when optimizing for size, unless forced "
2571 "to vectorize.");
2572 ORE->emit([&]() {
2573 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2574 OrigLoop->getStartLoc(),
2575 OrigLoop->getHeader())
2576 << "Code-size may be reduced by not forcing "
2577 "vectorization, or by source-code modifications "
2578 "eliminating the need for runtime checks "
2579 "(e.g., adding 'restrict').";
2580 });
2581 }
2582
2583 LoopBypassBlocks.push_back(MemCheckBlock);
2584
2585 AddedSafetyChecks = true;
2586
2587 return MemCheckBlock;
2588 }
2589
2590 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2591 LoopScalarBody = OrigLoop->getHeader();
2592 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2593 assert(LoopVectorPreHeader && "Invalid loop structure");
2594 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2595 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2596 "multiple exit loop without required epilogue?");
2597
2598 LoopMiddleBlock =
2599 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2600 LI, nullptr, Twine(Prefix) + "middle.block");
2601 LoopScalarPreHeader =
2602 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2603 nullptr, Twine(Prefix) + "scalar.ph");
2604 }
2605
2606 PHINode *InnerLoopVectorizer::createInductionResumeValue(
2607 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2608 ArrayRef<BasicBlock *> BypassBlocks,
2609 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2610 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
2611 assert(VectorTripCount && "Expected valid arguments");
2612
2613 Instruction *OldInduction = Legal->getPrimaryInduction();
2614 Value *&EndValue = IVEndValues[OrigPhi];
2615 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2616 if (OrigPhi == OldInduction) {
2617 // We know what the end value is.
2618 EndValue = VectorTripCount;
2619 } else {
2620 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
2621
2622 // Fast-math-flags propagate from the original induction instruction.
2623 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2624 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2625
2626 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
2627 Step, II.getKind(), II.getInductionBinOp());
2628 EndValue->setName("ind.end");
2629
2630 // Compute the end value for the additional bypass (if applicable).
2631 if (AdditionalBypass.first) {
2632 B.SetInsertPoint(AdditionalBypass.first,
2633 AdditionalBypass.first->getFirstInsertionPt());
2634 EndValueFromAdditionalBypass =
2635 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
2636 Step, II.getKind(), II.getInductionBinOp());
2637 EndValueFromAdditionalBypass->setName("ind.end");
2638 }
2639 }
2640
2641 // Create phi nodes to merge from the backedge-taken check block.
2642 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
2643 LoopScalarPreHeader->getFirstNonPHI());
2644 // Copy original phi DL over to the new one.
2645 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2646
2647 // The new PHI merges the original incoming value, in case of a bypass,
2648 // or the value at the end of the vectorized loop.
2649 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
2650
2651 // Fix the scalar body counter (PHI node).
2652 // The old induction's phi node in the scalar body needs the truncated
2653 // value.
2654 for (BasicBlock *BB : BypassBlocks)
2655 BCResumeVal->addIncoming(II.getStartValue(), BB);
2656
2657 if (AdditionalBypass.first)
2658 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
2659 EndValueFromAdditionalBypass);
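  // Illustrative sketch (hypothetical names) of the resume phi built above:
  //   scalar.ph:
  //     %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
  //                              [ %start,   %min.iters.check.block ],
  //                              [ %start,   %vector.memcheck ]
  // The IV resumes from the vector loop's end value when the scalar loop is
  // reached through the middle block, and from the original start value when
  // a runtime-check block bypasses the vector loop entirely.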
2660 return BCResumeVal;
2661 }
2662
2663 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2664 /// expansion results.
2665 static Value *getExpandedStep(const InductionDescriptor &ID,
2666 const SCEV2ValueTy &ExpandedSCEVs) {
2667 const SCEV *Step = ID.getStep();
2668 if (auto *C = dyn_cast<SCEVConstant>(Step))
2669 return C->getValue();
2670 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2671 return U->getValue();
2672 auto I = ExpandedSCEVs.find(Step);
2673 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2674 return I->second;
2675 }
2676
2677 void InnerLoopVectorizer::createInductionResumeValues(
2678 const SCEV2ValueTy &ExpandedSCEVs,
2679 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2680 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
2681 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
2682 "Inconsistent information about additional bypass.");
2683 // We are going to resume the execution of the scalar loop.
2684 // Go over all of the induction variables that we found and fix the
2685 // PHIs that are left in the scalar version of the loop.
2686 // The starting values of PHI nodes depend on the counter of the last
2687 // iteration in the vectorized loop.
2688 // If we come from a bypass edge then we need to start from the original
2689 // start value.
2690 for (const auto &InductionEntry : Legal->getInductionVars()) {
2691 PHINode *OrigPhi = InductionEntry.first;
2692 const InductionDescriptor &II = InductionEntry.second;
2693 PHINode *BCResumeVal = createInductionResumeValue(
2694 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
2695 AdditionalBypass);
2696 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
2697 }
2698 }
2699
2700 std::pair<BasicBlock *, Value *>
2701 InnerLoopVectorizer::createVectorizedLoopSkeleton(
2702 const SCEV2ValueTy &ExpandedSCEVs) {
2703 /*
2704 In this function we generate a new loop. The new loop will contain
2705 the vectorized instructions while the old loop will continue to run the
2706 scalar remainder.
2707
2708 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2709 / | preheader are expanded here. Eventually all required SCEV
2710 / | expansion should happen here.
2711 / v
2712 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2713 | / |
2714 | / v
2715 || [ ] <-- vector pre header.
2716 |/ |
2717 | v
2718 | [ ] \
2719 | [ ]_| <-- vector loop (created during VPlan execution).
2720 | |
2721 | v
2722 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2723 | | successors created during VPlan execution)
2724 \/ |
2725 /\ v
2726 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2727 | |
2728 (opt) v <-- edge from middle to exit iff epilogue is not required.
2729 | [ ] \
2730 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
2731 \ |
2732 \ v
2733 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2734 ...
2735 */
2736
2737 // Create an empty vector loop, and prepare basic blocks for the runtime
2738 // checks.
2739 createVectorLoopSkeleton("");
2740
2741 // Now, compare the new count to zero. If it is zero skip the vector loop and
2742 // jump to the scalar loop. This check also covers the case where the
2743 // backedge-taken count is uint##_max: adding one to it will overflow leading
2744 // to an incorrect trip count of zero. In this (rare) case we will also jump
2745 // to the scalar loop.
2746 emitIterationCountCheck(LoopScalarPreHeader);
2747
2748 // Generate the code to check any assumptions that we've made for SCEV
2749 // expressions.
2750 emitSCEVChecks(LoopScalarPreHeader);
2751
2752 // Generate the code that checks in runtime if arrays overlap. We put the
2753 // checks into a separate block to make the more common case of few elements
2754 // faster.
2755 emitMemRuntimeChecks(LoopScalarPreHeader);
2756
2757 // Emit phis for the new starting index of the scalar loop.
2758 createInductionResumeValues(ExpandedSCEVs);
2759
2760 return {LoopVectorPreHeader, nullptr};
2761 }
2762
2763 // Fix up external users of the induction variable. At this point, we are
2764 // in LCSSA form, with all external PHIs that use the IV having one input value,
2765 // coming from the remainder loop. We need those PHIs to also have a correct
2766 // value for the IV when arriving directly from the middle block.
2767 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2768 const InductionDescriptor &II,
2769 Value *VectorTripCount, Value *EndValue,
2770 BasicBlock *MiddleBlock,
2771 BasicBlock *VectorHeader, VPlan &Plan,
2772 VPTransformState &State) {
2773 // There are two kinds of external IV usages - those that use the value
2774 // computed in the last iteration (the PHI) and those that use the penultimate
2775 // value (the value that feeds into the phi from the loop latch).
2776 // We allow both, but they, obviously, have different values.
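  // Illustrative sketch (hypothetical IR): for the induction
  //   loop:
  //     %iv      = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  //     %iv.next = add i64 %iv, 1
  // an LCSSA phi of %iv.next in the exit block receives EndValue, while an
  // LCSSA phi of %iv receives the recomputed value
  //   Start + Step * (VectorTripCount - 1)
  // which is materialized below as %ind.escape.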
2777
2778 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
2779
2780 DenseMap<Value *, Value *> MissingVals;
2781
2782 // An external user of the last iteration's value should see the value that
2783 // the remainder loop uses to initialize its own IV.
2784 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2785 for (User *U : PostInc->users()) {
2786 Instruction *UI = cast<Instruction>(U);
2787 if (!OrigLoop->contains(UI)) {
2788 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2789 MissingVals[UI] = EndValue;
2790 }
2791 }
2792
2793   // An external user of the penultimate value needs to see EndValue - Step.
2794 // The simplest way to get this is to recompute it from the constituent SCEVs,
2795 // that is Start + (Step * (CRD - 1)).
2796 for (User *U : OrigPhi->users()) {
2797 auto *UI = cast<Instruction>(U);
2798 if (!OrigLoop->contains(UI)) {
2799 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2800 IRBuilder<> B(MiddleBlock->getTerminator());
2801
2802 // Fast-math-flags propagate from the original induction instruction.
2803 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2804 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2805
2806 Value *CountMinusOne = B.CreateSub(
2807 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
2808 CountMinusOne->setName("cmo");
2809
2810 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2811 assert(StepVPV && "step must have been expanded during VPlan execution");
2812 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2813 : State.get(StepVPV, {0, 0});
2814 Value *Escape =
2815 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
2816 II.getKind(), II.getInductionBinOp());
2817 Escape->setName("ind.escape");
2818 MissingVals[UI] = Escape;
2819 }
2820 }
2821
2822 for (auto &I : MissingVals) {
2823 PHINode *PHI = cast<PHINode>(I.first);
2824     // One corner case we have to handle is two IVs "chasing" each other,
2825 // that is %IV2 = phi [...], [ %IV1, %latch ]
2826 // In this case, if IV1 has an external use, we need to avoid adding both
2827 // "last value of IV1" and "penultimate value of IV2". So, verify that we
2828 // don't already have an incoming value for the middle block.
2829 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
2830 PHI->addIncoming(I.second, MiddleBlock);
2831 Plan.removeLiveOut(PHI);
2832 }
2833 }
2834 }
2835
2836 namespace {
2837
2838 struct CSEDenseMapInfo {
2839   static bool canHandle(const Instruction *I) {
2840 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2841 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2842 }
2843
2844   static inline Instruction *getEmptyKey() {
2845 return DenseMapInfo<Instruction *>::getEmptyKey();
2846 }
2847
2848   static inline Instruction *getTombstoneKey() {
2849 return DenseMapInfo<Instruction *>::getTombstoneKey();
2850 }
2851
2852   static unsigned getHashValue(const Instruction *I) {
2853 assert(canHandle(I) && "Unknown instruction!");
2854 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2855 I->value_op_end()));
2856 }
2857
2858   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2859 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2860 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2861 return LHS == RHS;
2862 return LHS->isIdenticalTo(RHS);
2863 }
2864 };
2865
2866 } // end anonymous namespace
2867
2868 /// Perform CSE of induction variable instructions.
2869 static void cse(BasicBlock *BB) {
2870 // Perform simple cse.
2871 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2872 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2873 if (!CSEDenseMapInfo::canHandle(&In))
2874 continue;
2875
2876 // Check if we can replace this instruction with any of the
2877 // visited instructions.
2878 if (Instruction *V = CSEMap.lookup(&In)) {
2879 In.replaceAllUsesWith(V);
2880 In.eraseFromParent();
2881 continue;
2882 }
2883
2884 CSEMap[&In] = &In;
2885 }
2886 }
2887
2888 InstructionCost
2889 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2890 ElementCount VF) const {
2891 // We only need to calculate a cost if the VF is scalar; for actual vectors
2892 // we should already have a pre-calculated cost at each VF.
2893 if (!VF.isScalar())
2894 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2895
2896 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2897 Type *RetTy = CI->getType();
2898 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2899 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2900 return *RedCost;
2901
2902 SmallVector<Type *, 4> Tys;
2903 for (auto &ArgOp : CI->args())
2904 Tys.push_back(ArgOp->getType());
2905
2906 InstructionCost ScalarCallCost =
2907 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2908
2909 // If this is an intrinsic we may have a lower cost for it.
2910 if (getVectorIntrinsicIDForCall(CI, TLI)) {
2911 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2912 return std::min(ScalarCallCost, IntrinsicCost);
2913 }
2914 return ScalarCallCost;
2915 }
2916
2917 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
2918 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2919 return Elt;
2920 return VectorType::get(Elt, VF);
2921 }
2922
2923 InstructionCost
2924 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2925 ElementCount VF) const {
2926 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2927 assert(ID && "Expected intrinsic call!");
2928 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
2929 FastMathFlags FMF;
2930 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2931 FMF = FPMO->getFastMathFlags();
2932
2933 SmallVector<const Value *> Arguments(CI->args());
2934 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2935 SmallVector<Type *> ParamTys;
2936 std::transform(FTy->param_begin(), FTy->param_end(),
2937 std::back_inserter(ParamTys),
2938 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
2939
2940 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2941 dyn_cast<IntrinsicInst>(CI));
2942 return TTI.getIntrinsicInstrCost(CostAttrs,
2943 TargetTransformInfo::TCK_RecipThroughput);
2944 }
2945
2946 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
2947 VPlan &Plan) {
2948 // Fix widened non-induction PHIs by setting up the PHI operands.
2949 if (EnableVPlanNativePath)
2950 fixNonInductionPHIs(Plan, State);
2951
2952 // Forget the original basic block.
2953 PSE.getSE()->forgetLoop(OrigLoop);
2954 PSE.getSE()->forgetBlockAndLoopDispositions();
2955
2956 // After vectorization, the exit blocks of the original loop will have
2957 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2958 // looked through single-entry phis.
2959 SmallVector<BasicBlock *> ExitBlocks;
2960 OrigLoop->getExitBlocks(ExitBlocks);
2961 for (BasicBlock *Exit : ExitBlocks)
2962 for (PHINode &PN : Exit->phis())
2963 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
2964
2965 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2966 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
2967 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
2968 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2969 // No edge from the middle block to the unique exit block has been inserted
2970 // and there is nothing to fix from vector loop; phis should have incoming
2971 // from scalar loop only.
2972 } else {
2973 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
2974 // the cost model.
2975
2976 // If we inserted an edge from the middle block to the unique exit block,
2977 // update uses outside the loop (phis) to account for the newly inserted
2978 // edge.
2979
2980 // Fix-up external users of the induction variables.
2981 for (const auto &Entry : Legal->getInductionVars())
2982 fixupIVUsers(Entry.first, Entry.second,
2983 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
2984 IVEndValues[Entry.first], LoopMiddleBlock,
2985 VectorLoop->getHeader(), Plan, State);
2986 }
2987
2988 // Fix live-out phis not already fixed earlier.
2989 for (const auto &KV : Plan.getLiveOuts())
2990 KV.second->fixPhi(Plan, State);
2991
2992 for (Instruction *PI : PredicatedInstructions)
2993 sinkScalarOperands(&*PI);
2994
2995 // Remove redundant induction instructions.
2996 cse(VectorLoop->getHeader());
2997
2998 // Set/update profile weights for the vector and remainder loops as original
2999 // loop iterations are now distributed among them. Note that original loop
3000 // represented by LoopScalarBody becomes remainder loop after vectorization.
3001 //
3002   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3003   // end up with a slightly less accurate result, but that should be OK since
3004   // the profile is not inherently precise anyway. Note also that a possible
3005   // bypass of vector code caused by legality checks is ignored, optimistically
3006   // assigning all the weight to the vector loop.
3007 //
3008   // For scalable vectorization we can't know at compile time how many
3009   // iterations of the loop are handled in one vector iteration, so instead
3010   // assume a pessimistic vscale of '1'.
3011 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3012 LI->getLoopFor(LoopScalarBody),
3013 VF.getKnownMinValue() * UF);
3014 }
3015
3016 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3017 // The basic block and loop containing the predicated instruction.
3018 auto *PredBB = PredInst->getParent();
3019 auto *VectorLoop = LI->getLoopFor(PredBB);
3020
3021 // Initialize a worklist with the operands of the predicated instruction.
3022 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3023
3024 // Holds instructions that we need to analyze again. An instruction may be
3025 // reanalyzed if we don't yet know if we can sink it or not.
3026 SmallVector<Instruction *, 8> InstsToReanalyze;
3027
3028 // Returns true if a given use occurs in the predicated block. Phi nodes use
3029 // their operands in their corresponding predecessor blocks.
3030 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3031 auto *I = cast<Instruction>(U.getUser());
3032 BasicBlock *BB = I->getParent();
3033 if (auto *Phi = dyn_cast<PHINode>(I))
3034 BB = Phi->getIncomingBlock(
3035 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3036 return BB == PredBB;
3037 };
3038
3039 // Iteratively sink the scalarized operands of the predicated instruction
3040   // into the block we created for it. When an instruction is sunk, its
3041   // operands are then added to the worklist. The algorithm ends when a pass
3042   // through the worklist does not sink a single instruction.
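  // Illustrative sketch (hypothetical IR): for a scalarized, predicated store
  //   vector.body:
  //     %gep = getelementptr inbounds i32, ptr %base, i64 %idx
  //     ...
  //   pred.store.if:
  //     store i32 %val, ptr %gep
  // the address computation %gep is moved into pred.store.if when the store is
  // its only user, so it only executes when the predicate is true.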
3043 bool Changed;
3044 do {
3045 // Add the instructions that need to be reanalyzed to the worklist, and
3046 // reset the changed indicator.
3047 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3048 InstsToReanalyze.clear();
3049 Changed = false;
3050
3051 while (!Worklist.empty()) {
3052 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3053
3054 // We can't sink an instruction if it is a phi node, is not in the loop,
3055 // may have side effects or may read from memory.
3056       // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3057 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3058 I->mayHaveSideEffects() || I->mayReadFromMemory())
3059 continue;
3060
3061 // If the instruction is already in PredBB, check if we can sink its
3062 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3063 // sinking the scalar instruction I, hence it appears in PredBB; but it
3064 // may have failed to sink I's operands (recursively), which we try
3065 // (again) here.
3066 if (I->getParent() == PredBB) {
3067 Worklist.insert(I->op_begin(), I->op_end());
3068 continue;
3069 }
3070
3071 // It's legal to sink the instruction if all its uses occur in the
3072 // predicated block. Otherwise, there's nothing to do yet, and we may
3073 // need to reanalyze the instruction.
3074 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3075 InstsToReanalyze.push_back(I);
3076 continue;
3077 }
3078
3079 // Move the instruction to the beginning of the predicated block, and add
3080       // its operands to the worklist.
3081 I->moveBefore(&*PredBB->getFirstInsertionPt());
3082 Worklist.insert(I->op_begin(), I->op_end());
3083
3084 // The sinking may have enabled other instructions to be sunk, so we will
3085 // need to iterate.
3086 Changed = true;
3087 }
3088 } while (Changed);
3089 }
3090
3091 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3092 VPTransformState &State) {
3093 auto Iter = vp_depth_first_deep(Plan.getEntry());
3094 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3095 for (VPRecipeBase &P : VPBB->phis()) {
3096 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3097 if (!VPPhi)
3098 continue;
3099 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3100 // Make sure the builder has a valid insert point.
3101 Builder.SetInsertPoint(NewPhi);
3102 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3103 VPValue *Inc = VPPhi->getIncomingValue(i);
3104 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3105 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3106 }
3107 }
3108 }
3109 }
3110
3111 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3112 // We should not collect Scalars more than once per VF. Right now, this
3113 // function is called from collectUniformsAndScalars(), which already does
3114 // this check. Collecting Scalars for VF=1 does not make any sense.
3115 assert(VF.isVector() && !Scalars.contains(VF) &&
3116 "This function should not be visited twice for the same VF");
3117
3118 // This avoids any chances of creating a REPLICATE recipe during planning
3119 // since that would result in generation of scalarized code during execution,
3120 // which is not supported for scalable vectors.
3121 if (VF.isScalable()) {
3122 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3123 return;
3124 }
3125
3126 SmallSetVector<Instruction *, 8> Worklist;
3127
3128 // These sets are used to seed the analysis with pointers used by memory
3129 // accesses that will remain scalar.
3130 SmallSetVector<Instruction *, 8> ScalarPtrs;
3131 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3132 auto *Latch = TheLoop->getLoopLatch();
3133
3134 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3135 // The pointer operands of loads and stores will be scalar as long as the
3136 // memory access is not a gather or scatter operation. The value operand of a
3137 // store will remain scalar if the store is scalarized.
3138 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3139 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3140 assert(WideningDecision != CM_Unknown &&
3141 "Widening decision should be ready at this moment");
3142 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3143 if (Ptr == Store->getValueOperand())
3144 return WideningDecision == CM_Scalarize;
3145 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3146 "Ptr is neither a value or pointer operand");
3147 return WideningDecision != CM_GatherScatter;
3148 };
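  // For example (hypothetical IR), in
  //   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
  //   %v   = load i32, ptr %gep
  // a consecutive (widened, non-gather) load only needs the lane-0 address, so
  // the use of %gep is a scalar use; a gather would instead need a full vector
  // of pointers.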
3149
3150 // A helper that returns true if the given value is a bitcast or
3151 // getelementptr instruction contained in the loop.
3152 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3153 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3154 isa<GetElementPtrInst>(V)) &&
3155 !TheLoop->isLoopInvariant(V);
3156 };
3157
3158 // A helper that evaluates a memory access's use of a pointer. If the use will
3159 // be a scalar use and the pointer is only used by memory accesses, we place
3160 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3161 // PossibleNonScalarPtrs.
3162 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3163 // We only care about bitcast and getelementptr instructions contained in
3164 // the loop.
3165 if (!isLoopVaryingBitCastOrGEP(Ptr))
3166 return;
3167
3168 // If the pointer has already been identified as scalar (e.g., if it was
3169 // also identified as uniform), there's nothing to do.
3170 auto *I = cast<Instruction>(Ptr);
3171 if (Worklist.count(I))
3172 return;
3173
3174 // If the use of the pointer will be a scalar use, and all users of the
3175 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3176 // place the pointer in PossibleNonScalarPtrs.
3177 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3178 return isa<LoadInst>(U) || isa<StoreInst>(U);
3179 }))
3180 ScalarPtrs.insert(I);
3181 else
3182 PossibleNonScalarPtrs.insert(I);
3183 };
3184
3185   // We seed the scalars analysis with two classes of instructions: (1)
3186 // instructions marked uniform-after-vectorization and (2) bitcast,
3187 // getelementptr and (pointer) phi instructions used by memory accesses
3188 // requiring a scalar use.
3189 //
3190 // (1) Add to the worklist all instructions that have been identified as
3191 // uniform-after-vectorization.
3192 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3193
3194 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3195 // memory accesses requiring a scalar use. The pointer operands of loads and
3196   // stores will be scalar as long as the memory access is not a gather or
3197 // scatter operation. The value operand of a store will remain scalar if the
3198 // store is scalarized.
3199 for (auto *BB : TheLoop->blocks())
3200 for (auto &I : *BB) {
3201 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3202 evaluatePtrUse(Load, Load->getPointerOperand());
3203 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3204 evaluatePtrUse(Store, Store->getPointerOperand());
3205 evaluatePtrUse(Store, Store->getValueOperand());
3206 }
3207 }
3208 for (auto *I : ScalarPtrs)
3209 if (!PossibleNonScalarPtrs.count(I)) {
3210 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3211 Worklist.insert(I);
3212 }
3213
3214 // Insert the forced scalars.
3215 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3216 // induction variable when the PHI user is scalarized.
3217 auto ForcedScalar = ForcedScalars.find(VF);
3218 if (ForcedScalar != ForcedScalars.end())
3219 for (auto *I : ForcedScalar->second) {
3220 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3221 Worklist.insert(I);
3222 }
3223
3224 // Expand the worklist by looking through any bitcasts and getelementptr
3225 // instructions we've already identified as scalar. This is similar to the
3226 // expansion step in collectLoopUniforms(); however, here we're only
3227 // expanding to include additional bitcasts and getelementptr instructions.
3228 unsigned Idx = 0;
3229 while (Idx != Worklist.size()) {
3230 Instruction *Dst = Worklist[Idx++];
3231 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3232 continue;
3233 auto *Src = cast<Instruction>(Dst->getOperand(0));
3234 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3235 auto *J = cast<Instruction>(U);
3236 return !TheLoop->contains(J) || Worklist.count(J) ||
3237 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3238 isScalarUse(J, Src));
3239 })) {
3240 Worklist.insert(Src);
3241 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3242 }
3243 }
3244
3245 // An induction variable will remain scalar if all users of the induction
3246 // variable and induction variable update remain scalar.
3247 for (const auto &Induction : Legal->getInductionVars()) {
3248 auto *Ind = Induction.first;
3249 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3250
3251 // If tail-folding is applied, the primary induction variable will be used
3252 // to feed a vector compare.
3253 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3254 continue;
3255
3256 // Returns true if \p Indvar is a pointer induction that is used directly by
3257 // load/store instruction \p I.
3258 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3259 Instruction *I) {
3260 return Induction.second.getKind() ==
3261 InductionDescriptor::IK_PtrInduction &&
3262 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3263 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3264 };
3265
3266 // Determine if all users of the induction variable are scalar after
3267 // vectorization.
3268 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3269 auto *I = cast<Instruction>(U);
3270 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3271 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3272 });
3273 if (!ScalarInd)
3274 continue;
3275
3276     // If the induction variable update is a fixed-order recurrence, neither
3277     // the induction variable nor its update should be marked scalar after
3278 // vectorization.
3279 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3280 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3281 continue;
3282
3283 // Determine if all users of the induction variable update instruction are
3284 // scalar after vectorization.
3285 auto ScalarIndUpdate =
3286 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3287 auto *I = cast<Instruction>(U);
3288 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3289 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3290 });
3291 if (!ScalarIndUpdate)
3292 continue;
3293
3294 // The induction variable and its update instruction will remain scalar.
3295 Worklist.insert(Ind);
3296 Worklist.insert(IndUpdate);
3297 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3298 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3299 << "\n");
3300 }
3301
3302 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3303 }
3304
3305 bool LoopVectorizationCostModel::isScalarWithPredication(
3306 Instruction *I, ElementCount VF) const {
3307 if (!isPredicatedInst(I))
3308 return false;
3309
3310 // Do we have a non-scalar lowering for this predicated
3311 // instruction? No - it is scalar with predication.
3312 switch(I->getOpcode()) {
3313 default:
3314 return true;
3315 case Instruction::Call:
3316 if (VF.isScalar())
3317 return true;
3318 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3319 .Kind == CM_Scalarize;
3320 case Instruction::Load:
3321 case Instruction::Store: {
3322 auto *Ptr = getLoadStorePointerOperand(I);
3323 auto *Ty = getLoadStoreType(I);
3324 Type *VTy = Ty;
3325 if (VF.isVector())
3326 VTy = VectorType::get(Ty, VF);
3327 const Align Alignment = getLoadStoreAlignment(I);
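    // For example (hypothetical), a predicated load of i32 with VF = 4 avoids
    // scalarization only if the target supports a masked load or a masked
    // gather of <4 x i32>; otherwise it stays scalar with predication.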
3328 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3329 TTI.isLegalMaskedGather(VTy, Alignment))
3330 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3331 TTI.isLegalMaskedScatter(VTy, Alignment));
3332 }
3333 case Instruction::UDiv:
3334 case Instruction::SDiv:
3335 case Instruction::SRem:
3336 case Instruction::URem: {
3337 // We have the option to use the safe-divisor idiom to avoid predication.
3338 // The cost based decision here will always select safe-divisor for
3339 // scalable vectors as scalarization isn't legal.
3340 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3341 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3342 }
3343 }
3344 }
3345
3346 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3347 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3348 return false;
3349
3350 // Can we prove this instruction is safe to unconditionally execute?
3351 // If not, we must use some form of predication.
3352 switch(I->getOpcode()) {
3353 default:
3354 return false;
3355 case Instruction::Load:
3356 case Instruction::Store: {
3357 if (!Legal->isMaskRequired(I))
3358 return false;
3359 // When we know the load's address is loop invariant and the instruction
3360 // in the original scalar loop was unconditionally executed then we
3361 // don't need to mark it as a predicated instruction. Tail folding may
3362 // introduce additional predication, but we're guaranteed to always have
3363 // at least one active lane. We call Legal->blockNeedsPredication here
3364     // because it doesn't query tail-folding. For stores, we need to prove
3365     // both speculation safety (which follows from the same argument as for
3366     // loads) and that the value being stored is correct. The easiest form
3367     // of the latter is to require that all values stored are the same.
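    // For example (hypothetical), a load from a loop-invariant address in a
    // block that executed unconditionally in the original loop, or a store of
    // a loop-invariant value to such an address, is not treated as predicated
    // even when the tail is folded by masking.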
3368 if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3369 (isa<LoadInst>(I) ||
3370 (isa<StoreInst>(I) &&
3371 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3372 !Legal->blockNeedsPredication(I->getParent()))
3373 return false;
3374 return true;
3375 }
3376 case Instruction::UDiv:
3377 case Instruction::SDiv:
3378 case Instruction::SRem:
3379 case Instruction::URem:
3380     // TODO: We can use the loop-preheader as a context point here and get
3381     // context-sensitive reasoning.
3382 return !isSafeToSpeculativelyExecute(I);
3383 case Instruction::Call:
3384 return Legal->isMaskRequired(I);
3385 }
3386 }
3387
3388 std::pair<InstructionCost, InstructionCost>
3389 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3390 ElementCount VF) const {
3391 assert(I->getOpcode() == Instruction::UDiv ||
3392 I->getOpcode() == Instruction::SDiv ||
3393 I->getOpcode() == Instruction::SRem ||
3394 I->getOpcode() == Instruction::URem);
3395 assert(!isSafeToSpeculativelyExecute(I));
3396
3397 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3398
3399 // Scalarization isn't legal for scalable vector types
3400 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3401 if (!VF.isScalable()) {
3402 // Get the scalarization cost and scale this amount by the probability of
3403 // executing the predicated block. If the instruction is not predicated,
3404 // we fall through to the next case.
3405 ScalarizationCost = 0;
3406
3407 // These instructions have a non-void type, so account for the phi nodes
3408 // that we will create. This cost is likely to be zero. The phi node
3409 // cost, if any, should be scaled by the block probability because it
3410 // models a copy at the end of each predicated block.
3411 ScalarizationCost += VF.getKnownMinValue() *
3412 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3413
3414 // The cost of the non-predicated instruction.
3415 ScalarizationCost += VF.getKnownMinValue() *
3416 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3417
3418 // The cost of insertelement and extractelement instructions needed for
3419 // scalarization.
3420 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3421
3422 // Scale the cost by the probability of executing the predicated blocks.
3423 // This assumes the predicated block for each vector lane is equally
3424 // likely.
3425 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3426 }
3427 InstructionCost SafeDivisorCost = 0;
3428
3429 auto *VecTy = ToVectorTy(I->getType(), VF);
3430
3431 // The cost of the select guard to ensure all lanes are well defined
3432 // after we speculate above any internal control flow.
3433 SafeDivisorCost += TTI.getCmpSelInstrCost(
3434 Instruction::Select, VecTy,
3435 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
3436 CmpInst::BAD_ICMP_PREDICATE, CostKind);
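  // Illustrative sketch (hypothetical IR) of the safe-divisor form being
  // costed here: a predicated udiv %a, %b becomes
  //   %b.safe = select <4 x i1> %mask, <4 x i32> %b,
  //                    <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  //   %div    = udiv <4 x i32> %a, %b.safe
  // so inactive lanes divide by 1 instead of a potentially zero divisor.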
3437
3438 // Certain instructions can be cheaper to vectorize if they have a constant
3439   // second vector operand. One example of this is shifts on x86.
3440 Value *Op2 = I->getOperand(1);
3441 auto Op2Info = TTI.getOperandInfo(Op2);
3442 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3443 Legal->isInvariant(Op2))
3444 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3445
3446 SmallVector<const Value *, 4> Operands(I->operand_values());
3447 SafeDivisorCost += TTI.getArithmeticInstrCost(
3448 I->getOpcode(), VecTy, CostKind,
3449 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3450 Op2Info, Operands, I);
3451 return {ScalarizationCost, SafeDivisorCost};
3452 }
3453
3454 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3455 Instruction *I, ElementCount VF) const {
3456 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3457 assert(getWideningDecision(I, VF) == CM_Unknown &&
3458 "Decision should not be set yet.");
3459 auto *Group = getInterleavedAccessGroup(I);
3460 assert(Group && "Must have a group.");
3461
3462   // If the instruction's allocated size doesn't equal its type size, it
3463 // requires padding and will be scalarized.
3464 auto &DL = I->getDataLayout();
3465 auto *ScalarTy = getLoadStoreType(I);
3466 if (hasIrregularType(ScalarTy, DL))
3467 return false;
3468
3469 // If the group involves a non-integral pointer, we may not be able to
3470 // losslessly cast all values to a common type.
3471 unsigned InterleaveFactor = Group->getFactor();
3472 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3473 for (unsigned i = 0; i < InterleaveFactor; i++) {
3474 Instruction *Member = Group->getMember(i);
3475 if (!Member)
3476 continue;
3477 auto *MemberTy = getLoadStoreType(Member);
3478 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3479 // Don't coerce non-integral pointers to integers or vice versa.
3480 if (MemberNI != ScalarNI) {
3481 // TODO: Consider adding special nullptr value case here
3482 return false;
3483 } else if (MemberNI && ScalarNI &&
3484 ScalarTy->getPointerAddressSpace() !=
3485 MemberTy->getPointerAddressSpace()) {
3486 return false;
3487 }
3488 }
3489
3490 // Check if masking is required.
3491 // A Group may need masking for one of two reasons: it resides in a block that
3492 // needs predication, or it was decided to use masking to deal with gaps
3493 // (either a gap at the end of a load-access that may result in a speculative
3494 // load, or any gaps in a store-access).
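  // For example (hypothetical), a factor-3 store group writing only members 0
  // and 2 has a gap at member 1 and needs a mask, and a load group whose last
  // member may read past the final iteration needs masking unless a scalar
  // epilogue is allowed.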
3495 bool PredicatedAccessRequiresMasking =
3496 blockNeedsPredicationForAnyReason(I->getParent()) &&
3497 Legal->isMaskRequired(I);
3498 bool LoadAccessWithGapsRequiresEpilogMasking =
3499 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3500 !isScalarEpilogueAllowed();
3501 bool StoreAccessWithGapsRequiresMasking =
3502 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3503 if (!PredicatedAccessRequiresMasking &&
3504 !LoadAccessWithGapsRequiresEpilogMasking &&
3505 !StoreAccessWithGapsRequiresMasking)
3506 return true;
3507
3508 // If masked interleaving is required, we expect that the user/target had
3509 // enabled it, because otherwise it either wouldn't have been created or
3510 // it should have been invalidated by the CostModel.
3511 assert(useMaskedInterleavedAccesses(TTI) &&
3512 "Masked interleave-groups for predicated accesses are not enabled.");
3513
3514 if (Group->isReverse())
3515 return false;
3516
3517 auto *Ty = getLoadStoreType(I);
3518 const Align Alignment = getLoadStoreAlignment(I);
3519 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3520 : TTI.isLegalMaskedStore(Ty, Alignment);
3521 }
3522
3523 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3524 Instruction *I, ElementCount VF) {
3525 // Get and ensure we have a valid memory instruction.
3526 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3527
3528 auto *Ptr = getLoadStorePointerOperand(I);
3529 auto *ScalarTy = getLoadStoreType(I);
3530
3531 // In order to be widened, the pointer should be consecutive, first of all.
3532 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3533 return false;
3534
3535 // If the instruction is a store located in a predicated block, it will be
3536 // scalarized.
3537 if (isScalarWithPredication(I, VF))
3538 return false;
3539
3540   // If the instruction's allocated size doesn't equal its type size, it
3541 // requires padding and will be scalarized.
3542 auto &DL = I->getDataLayout();
3543 if (hasIrregularType(ScalarTy, DL))
3544 return false;
3545
3546 return true;
3547 }
3548
3549 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3550 // We should not collect Uniforms more than once per VF. Right now,
3551 // this function is called from collectUniformsAndScalars(), which
3552 // already does this check. Collecting Uniforms for VF=1 does not make any
3553 // sense.
3554
3555 assert(VF.isVector() && !Uniforms.contains(VF) &&
3556 "This function should not be visited twice for the same VF");
3557
3558   // Visit the list of Uniforms. If we do not find any uniform value, we will
3559   // not analyze it again; Uniforms.count(VF) will still return 1.
3560 Uniforms[VF].clear();
3561
3562 // We now know that the loop is vectorizable!
3563 // Collect instructions inside the loop that will remain uniform after
3564 // vectorization.
3565
3566 // Global values, params and instructions outside of current loop are out of
3567 // scope.
3568 auto isOutOfScope = [&](Value *V) -> bool {
3569 Instruction *I = dyn_cast<Instruction>(V);
3570 return (!I || !TheLoop->contains(I));
3571 };
3572
3573 // Worklist containing uniform instructions demanding lane 0.
3574 SetVector<Instruction *> Worklist;
3575
3576 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3577 // that require predication must not be considered uniform after
3578 // vectorization, because that would create an erroneous replicating region
3579 // where only a single instance out of VF should be formed.
3580 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
3581 if (isOutOfScope(I)) {
3582 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3583 << *I << "\n");
3584 return;
3585 }
3586 if (isPredicatedInst(I)) {
3587 LLVM_DEBUG(
3588 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3589 << "\n");
3590 return;
3591 }
3592 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3593 Worklist.insert(I);
3594 };
3595
3596 // Start with the conditional branches exiting the loop. If the branch
3597 // condition is an instruction contained in the loop that is only used by the
3598 // branch, it is uniform.
3599 SmallVector<BasicBlock *> Exiting;
3600 TheLoop->getExitingBlocks(Exiting);
3601 for (BasicBlock *E : Exiting) {
3602 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3603 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3604 addToWorklistIfAllowed(Cmp);
3605 }
3606
3607 auto PrevVF = VF.divideCoefficientBy(2);
3608 // Return true if all lanes perform the same memory operation, and we can
3609   // thus choose to execute only one.
3610 auto isUniformMemOpUse = [&](Instruction *I) {
3611 // If the value was already known to not be uniform for the previous
3612 // (smaller VF), it cannot be uniform for the larger VF.
3613 if (PrevVF.isVector()) {
3614 auto Iter = Uniforms.find(PrevVF);
3615 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3616 return false;
3617 }
3618 if (!Legal->isUniformMemOp(*I, VF))
3619 return false;
3620 if (isa<LoadInst>(I))
3621 // Loading the same address always produces the same result - at least
3622 // assuming aliasing and ordering which have already been checked.
3623 return true;
3624 // Storing the same value on every iteration.
3625 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3626 };
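  // For example (hypothetical C), in
  //   for (i = 0; i < n; ++i) sum += *p;  // load from an invariant address
  //   for (i = 0; i < n; ++i) *q = 42;    // store of an invariant value
  // every lane performs the identical access, so a single scalar instance per
  // vector iteration suffices.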
3627
3628 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
3629 InstWidening WideningDecision = getWideningDecision(I, VF);
3630 assert(WideningDecision != CM_Unknown &&
3631 "Widening decision should be ready at this moment");
3632
3633 if (isUniformMemOpUse(I))
3634 return true;
3635
3636 return (WideningDecision == CM_Widen ||
3637 WideningDecision == CM_Widen_Reverse ||
3638 WideningDecision == CM_Interleave);
3639 };
3640
3641 // Returns true if Ptr is the pointer operand of a memory access instruction
3642 // I, I is known to not require scalarization, and the pointer is not also
3643 // stored.
3644 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3645 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3646 return false;
3647 return getLoadStorePointerOperand(I) == Ptr &&
3648 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3649 };
3650
3651 // Holds a list of values which are known to have at least one uniform use.
3652 // Note that there may be other uses which aren't uniform. A "uniform use"
3653 // here is something which only demands lane 0 of the unrolled iterations;
3654 // it does not imply that all lanes produce the same value (e.g. this is not
3655 // the usual meaning of uniform)
3656 SetVector<Value *> HasUniformUse;
3657
3658 // Scan the loop for instructions which are either a) known to have only
3659 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3660 for (auto *BB : TheLoop->blocks())
3661 for (auto &I : *BB) {
3662 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3663 switch (II->getIntrinsicID()) {
3664 case Intrinsic::sideeffect:
3665 case Intrinsic::experimental_noalias_scope_decl:
3666 case Intrinsic::assume:
3667 case Intrinsic::lifetime_start:
3668 case Intrinsic::lifetime_end:
3669 if (TheLoop->hasLoopInvariantOperands(&I))
3670 addToWorklistIfAllowed(&I);
3671 break;
3672 default:
3673 break;
3674 }
3675 }
3676
3677 // ExtractValue instructions must be uniform, because the operands are
3678 // known to be loop-invariant.
3679 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3680 assert(isOutOfScope(EVI->getAggregateOperand()) &&
3681 "Expected aggregate value to be loop invariant");
3682 addToWorklistIfAllowed(EVI);
3683 continue;
3684 }
3685
3686 // If there's no pointer operand, there's nothing to do.
3687 auto *Ptr = getLoadStorePointerOperand(&I);
3688 if (!Ptr)
3689 continue;
3690
3691 if (isUniformMemOpUse(&I))
3692 addToWorklistIfAllowed(&I);
3693
3694 if (isVectorizedMemAccessUse(&I, Ptr))
3695 HasUniformUse.insert(Ptr);
3696 }
3697
3698 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3699 // demanding) users. Since loops are assumed to be in LCSSA form, this
3700 // disallows uses outside the loop as well.
3701 for (auto *V : HasUniformUse) {
3702 if (isOutOfScope(V))
3703 continue;
3704 auto *I = cast<Instruction>(V);
3705 auto UsersAreMemAccesses =
3706 llvm::all_of(I->users(), [&](User *U) -> bool {
3707 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
3708 });
3709 if (UsersAreMemAccesses)
3710 addToWorklistIfAllowed(I);
3711 }
3712
3713   // Expand Worklist in topological order: whenever a new instruction is
3714   // added, its users should already be inside Worklist. This ensures that a
3715   // uniform instruction will only be used by uniform instructions.
3716 unsigned idx = 0;
3717 while (idx != Worklist.size()) {
3718 Instruction *I = Worklist[idx++];
3719
3720 for (auto *OV : I->operand_values()) {
3721 // isOutOfScope operands cannot be uniform instructions.
3722 if (isOutOfScope(OV))
3723 continue;
3724       // Fixed-order recurrence phis should typically be considered
3725       // non-uniform.
3726 auto *OP = dyn_cast<PHINode>(OV);
3727 if (OP && Legal->isFixedOrderRecurrence(OP))
3728 continue;
3729 // If all the users of the operand are uniform, then add the
3730 // operand into the uniform worklist.
3731 auto *OI = cast<Instruction>(OV);
3732 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3733 auto *J = cast<Instruction>(U);
3734 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
3735 }))
3736 addToWorklistIfAllowed(OI);
3737 }
3738 }
3739
3740 // For an instruction to be added into Worklist above, all its users inside
3741 // the loop should also be in Worklist. However, this condition cannot be
3742 // true for phi nodes that form a cyclic dependence. We must process phi
3743 // nodes separately. An induction variable will remain uniform if all users
3744 // of the induction variable and induction variable update remain uniform.
3745 // The code below handles both pointer and non-pointer induction variables.
3746 BasicBlock *Latch = TheLoop->getLoopLatch();
3747 for (const auto &Induction : Legal->getInductionVars()) {
3748 auto *Ind = Induction.first;
3749 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3750
3751 // Determine if all users of the induction variable are uniform after
3752 // vectorization.
3753 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3754 auto *I = cast<Instruction>(U);
3755 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3756 isVectorizedMemAccessUse(I, Ind);
3757 });
3758 if (!UniformInd)
3759 continue;
3760
3761 // Determine if all users of the induction variable update instruction are
3762 // uniform after vectorization.
3763 auto UniformIndUpdate =
3764 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3765 auto *I = cast<Instruction>(U);
3766 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3767 isVectorizedMemAccessUse(I, IndUpdate);
3768 });
3769 if (!UniformIndUpdate)
3770 continue;
3771
3772 // The induction variable and its update instruction will remain uniform.
3773 addToWorklistIfAllowed(Ind);
3774 addToWorklistIfAllowed(IndUpdate);
3775 }
3776
3777 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3778 }
3779
3780 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3781 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3782
3783 if (Legal->getRuntimePointerChecking()->Need) {
3784 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3785 "runtime pointer checks needed. Enable vectorization of this "
3786 "loop with '#pragma clang loop vectorize(enable)' when "
3787 "compiling with -Os/-Oz",
3788 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3789 return true;
3790 }
3791
3792 if (!PSE.getPredicate().isAlwaysTrue()) {
3793 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3794 "runtime SCEV checks needed. Enable vectorization of this "
3795 "loop with '#pragma clang loop vectorize(enable)' when "
3796 "compiling with -Os/-Oz",
3797 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3798 return true;
3799 }
3800
3801 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3802 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3803 reportVectorizationFailure("Runtime stride check for small trip count",
3804 "runtime stride == 1 checks needed. Enable vectorization of "
3805 "this loop without such check by compiling with -Os/-Oz",
3806 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3807 return true;
3808 }
3809
3810 return false;
3811 }
3812
3813 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3814 if (IsScalableVectorizationAllowed)
3815 return *IsScalableVectorizationAllowed;
3816
3817 IsScalableVectorizationAllowed = false;
3818 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3819 return false;
3820
3821 if (Hints->isScalableVectorizationDisabled()) {
3822 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3823 "ScalableVectorizationDisabled", ORE, TheLoop);
3824 return false;
3825 }
3826
3827 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3828
3829 auto MaxScalableVF = ElementCount::getScalable(
3830 std::numeric_limits<ElementCount::ScalarTy>::max());
3831
3832 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3833 // FIXME: While for scalable vectors this is currently sufficient, this should
3834 // be replaced by a more detailed mechanism that filters out specific VFs,
3835 // instead of invalidating vectorization for a whole set of VFs based on the
3836 // MaxVF.
3837
3838 // Disable scalable vectorization if the loop contains unsupported reductions.
3839 if (!canVectorizeReductions(MaxScalableVF)) {
3840 reportVectorizationInfo(
3841 "Scalable vectorization not supported for the reduction "
3842 "operations found in this loop.",
3843 "ScalableVFUnfeasible", ORE, TheLoop);
3844 return false;
3845 }
3846
3847 // Disable scalable vectorization if the loop contains any instructions
3848 // with element types not supported for scalable vectors.
3849 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3850 return !Ty->isVoidTy() &&
3851 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3852 })) {
3853 reportVectorizationInfo("Scalable vectorization is not supported "
3854 "for all element types found in this loop.",
3855 "ScalableVFUnfeasible", ORE, TheLoop);
3856 return false;
3857 }
3858
3859 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3860 reportVectorizationInfo("The target does not provide maximum vscale value "
3861 "for safe distance analysis.",
3862 "ScalableVFUnfeasible", ORE, TheLoop);
3863 return false;
3864 }
3865
3866 IsScalableVectorizationAllowed = true;
3867 return true;
3868 }
3869
3870 ElementCount
3871 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3872 if (!isScalableVectorizationAllowed())
3873 return ElementCount::getScalable(0);
3874
3875 auto MaxScalableVF = ElementCount::getScalable(
3876 std::numeric_limits<ElementCount::ScalarTy>::max());
3877 if (Legal->isSafeForAnyVectorWidth())
3878 return MaxScalableVF;
3879
3880 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3881 // Limit MaxScalableVF by the maximum safe dependence distance.
3882 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3883
3884 if (!MaxScalableVF)
3885 reportVectorizationInfo(
3886 "Max legal vector width too small, scalable vectorization "
3887 "unfeasible.",
3888 "ScalableVFUnfeasible", ORE, TheLoop);
3889
3890 return MaxScalableVF;
3891 }
3892
3893 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3894 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3895 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3896 unsigned SmallestType, WidestType;
3897 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3898
3899 // Get the maximum safe dependence distance in bits computed by LAA.
3900 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3901 // the memory access that is most restrictive (involved in the smallest
3902 // dependence distance).
3903 unsigned MaxSafeElements =
3904 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
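// For illustration with hypothetical numbers: if LAA reports a maximum
// safe vector width of 256 bits and the widest type is 32 bits, then
// MaxSafeElements = bit_floor(256 / 32) = 8 lanes.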
3905
3906 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3907 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3908
3909 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3910 << ".\n");
3911 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3912 << ".\n");
3913
3914 // First analyze the UserVF, fall back if the UserVF should be ignored.
3915 if (UserVF) {
3916 auto MaxSafeUserVF =
3917 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3918
3919 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3920 // If `VF=vscale x N` is safe, then so is `VF=N`
3921 if (UserVF.isScalable())
3922 return FixedScalableVFPair(
3923 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3924 else
3925 return UserVF;
3926 }
3927
3928 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3929
3930 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3931 // is better to ignore the hint and let the compiler choose a suitable VF.
3932 if (!UserVF.isScalable()) {
3933 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3934 << " is unsafe, clamping to max safe VF="
3935 << MaxSafeFixedVF << ".\n");
3936 ORE->emit([&]() {
3937 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3938 TheLoop->getStartLoc(),
3939 TheLoop->getHeader())
3940 << "User-specified vectorization factor "
3941 << ore::NV("UserVectorizationFactor", UserVF)
3942 << " is unsafe, clamping to maximum safe vectorization factor "
3943 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3944 });
3945 return MaxSafeFixedVF;
3946 }
3947
3948 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3949 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3950 << " is ignored because scalable vectors are not "
3951 "available.\n");
3952 ORE->emit([&]() {
3953 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3954 TheLoop->getStartLoc(),
3955 TheLoop->getHeader())
3956 << "User-specified vectorization factor "
3957 << ore::NV("UserVectorizationFactor", UserVF)
3958 << " is ignored because the target does not support scalable "
3959 "vectors. The compiler will pick a more suitable value.";
3960 });
3961 } else {
3962 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3963 << " is unsafe. Ignoring scalable UserVF.\n");
3964 ORE->emit([&]() {
3965 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3966 TheLoop->getStartLoc(),
3967 TheLoop->getHeader())
3968 << "User-specified vectorization factor "
3969 << ore::NV("UserVectorizationFactor", UserVF)
3970 << " is unsafe. Ignoring the hint to let the compiler pick a "
3971 "more suitable value.";
3972 });
3973 }
3974 }
3975
3976 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3977 << " / " << WidestType << " bits.\n");
3978
3979 FixedScalableVFPair Result(ElementCount::getFixed(1),
3980 ElementCount::getScalable(0));
3981 if (auto MaxVF =
3982 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3983 MaxSafeFixedVF, FoldTailByMasking))
3984 Result.FixedVF = MaxVF;
3985
3986 if (auto MaxVF =
3987 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3988 MaxSafeScalableVF, FoldTailByMasking))
3989 if (MaxVF.isScalable()) {
3990 Result.ScalableVF = MaxVF;
3991 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3992 << "\n");
3993 }
3994
3995 return Result;
3996 }
3997
3998 FixedScalableVFPair
3999 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4000 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4001 // TODO: It may be useful to do this, since the check is still likely to be
4002 // dynamically uniform if the target can skip it.
4003 reportVectorizationFailure(
4004 "Not inserting runtime ptr check for divergent target",
4005 "runtime pointer checks needed. Not enabled for divergent target",
4006 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4007 return FixedScalableVFPair::getNone();
4008 }
4009
4010 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4011 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4012 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4013 if (TC == 1) {
4014 reportVectorizationFailure("Single iteration (non) loop",
4015 "loop trip count is one, irrelevant for vectorization",
4016 "SingleIterationLoop", ORE, TheLoop);
4017 return FixedScalableVFPair::getNone();
4018 }
4019
4020 switch (ScalarEpilogueStatus) {
4021 case CM_ScalarEpilogueAllowed:
4022 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4023 case CM_ScalarEpilogueNotAllowedUsePredicate:
4024 [[fallthrough]];
4025 case CM_ScalarEpilogueNotNeededUsePredicate:
4026 LLVM_DEBUG(
4027 dbgs() << "LV: vector predicate hint/switch found.\n"
4028 << "LV: Not allowing scalar epilogue, creating predicated "
4029 << "vector loop.\n");
4030 break;
4031 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4032 // fallthrough as a special case of OptForSize
4033 case CM_ScalarEpilogueNotAllowedOptSize:
4034 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4035 LLVM_DEBUG(
4036 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4037 else
4038 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4039 << "count.\n");
4040
4041 // Bail if runtime checks are required, which are not good when optimising
4042 // for size.
4043 if (runtimeChecksRequired())
4044 return FixedScalableVFPair::getNone();
4045
4046 break;
4047 }
4048
4049 // The only loops we can vectorize without a scalar epilogue are loops with
4050 // a bottom-test and a single exiting block. We'd have to handle the fact
4051 // that not every instruction executes on the last iteration. This will
4052 // require a lane mask which varies through the vector loop body. (TODO)
4053 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4054 // If there was a tail-folding hint/switch, but we can't fold the tail by
4055 // masking, fall back to vectorization with a scalar epilogue.
4056 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4057 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4058 "scalar epilogue instead.\n");
4059 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4060 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4061 }
4062 return FixedScalableVFPair::getNone();
4063 }
4064
4065 // Now try tail folding.
4066
4067 // Invalidate interleave groups that require an epilogue if we can't mask
4068 // the interleave-group.
4069 if (!useMaskedInterleavedAccesses(TTI)) {
4070 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4071 "No decisions should have been taken at this point");
4072 // Note: There is no need to invalidate any cost modeling decisions here, as
4073 // none were taken so far.
4074 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4075 }
4076
4077 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4078
4079 // Avoid tail folding if the trip count is known to be a multiple of any VF
4080 // we choose.
4081 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4082 MaxFactors.FixedVF.getFixedValue();
4083 if (MaxFactors.ScalableVF) {
4084 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4085 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4086 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4087 *MaxPowerOf2RuntimeVF,
4088 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4089 } else
4090 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4091 }
4092
4093 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4094 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4095 "MaxFixedVF must be a power of 2");
4096 unsigned MaxVFtimesIC =
4097 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4098 ScalarEvolution *SE = PSE.getSE();
4099 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4100 const SCEV *ExitCount = SE->getAddExpr(
4101 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4102 const SCEV *Rem = SE->getURemExpr(
4103 SE->applyLoopGuards(ExitCount, TheLoop),
4104 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
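// Illustrative case: if loop guards prove the trip count is a multiple of
// 16 and MaxVFtimesIC is 8, Rem folds to zero and tail folding is skipped.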
4105 if (Rem->isZero()) {
4106 // Accept MaxFixedVF if we do not have a tail.
4107 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4108 return MaxFactors;
4109 }
4110 }
4111
4112 // If we don't know the precise trip count, or if the trip count that we
4113 // found modulo the vectorization factor is not zero, try to fold the tail
4114 // by masking.
4115 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4116 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4117 if (foldTailByMasking()) {
4118 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4119 LLVM_DEBUG(
4120 dbgs()
4121 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4122 "try to generate VP Intrinsics with scalable vector "
4123 "factors only.\n");
4124 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4125 // for now.
4126 // TODO: extend it for fixed vectors, if required.
4127 assert(MaxFactors.ScalableVF.isScalable() &&
4128 "Expected scalable vector factor.");
4129
4130 MaxFactors.FixedVF = ElementCount::getFixed(1);
4131 }
4132 return MaxFactors;
4133 }
4134
4135 // If there was a tail-folding hint/switch, but we can't fold the tail by
4136 // masking, fall back to vectorization with a scalar epilogue.
4137 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4138 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4139 "scalar epilogue instead.\n");
4140 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4141 return MaxFactors;
4142 }
4143
4144 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4145 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4146 return FixedScalableVFPair::getNone();
4147 }
4148
4149 if (TC == 0) {
4150 reportVectorizationFailure(
4151 "Unable to calculate the loop count due to complex control flow",
4152 "unable to calculate the loop count due to complex control flow",
4153 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4154 return FixedScalableVFPair::getNone();
4155 }
4156
4157 reportVectorizationFailure(
4158 "Cannot optimize for size and vectorize at the same time.",
4159 "cannot optimize for size and vectorize at the same time. "
4160 "Enable vectorization of this loop with '#pragma clang loop "
4161 "vectorize(enable)' when compiling with -Os/-Oz",
4162 "NoTailLoopWithOptForSize", ORE, TheLoop);
4163 return FixedScalableVFPair::getNone();
4164 }
4165
4166 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4167 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4168 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4169 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4170 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4171 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4172 : TargetTransformInfo::RGK_FixedWidthVector);
4173
4174 // Convenience function to return the minimum of two ElementCounts.
4175 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4176 assert((LHS.isScalable() == RHS.isScalable()) &&
4177 "Scalable flags must match");
4178 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4179 };
4180
4181 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4182 // Note that both WidestRegister and WidestType may not be powers of 2.
4183 auto MaxVectorElementCount = ElementCount::get(
4184 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4185 ComputeScalableMaxVF);
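// Example with made-up numbers: a 128-bit widest register and a 32-bit
// widest type yield MaxVectorElementCount = 4 (or vscale x 4 when
// computing the scalable maximum).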
4186 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4187 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4188 << (MaxVectorElementCount * WidestType) << " bits.\n");
4189
4190 if (!MaxVectorElementCount) {
4191 LLVM_DEBUG(dbgs() << "LV: The target has no "
4192 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4193 << " vector registers.\n");
4194 return ElementCount::getFixed(1);
4195 }
4196
4197 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4198 if (MaxVectorElementCount.isScalable() &&
4199 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4200 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4201 auto Min = Attr.getVScaleRangeMin();
4202 WidestRegisterMinEC *= Min;
4203 }
4204
4205 // When a scalar epilogue is required, at least one iteration of the scalar
4206 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4207 // max VF that results in a dead vector loop.
4208 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4209 MaxTripCount -= 1;
4210
4211 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4212 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4213 // If the upper bound on the loop trip count (TC) is known at compile time,
4214 // there is no point in choosing a VF greater than TC (as done in the loop
4215 // below). Select the maximum power of two which doesn't exceed TC. If
4216 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
4217 // the TC is less than or equal to the known number of lanes.
4218 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
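// E.g., illustratively, a known trip-count upper bound of 7 clamps the VF
// to bit_floor(7) = 4.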
4219 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4220 "exceeding the constant trip count: "
4221 << ClampedUpperTripCount << "\n");
4222 return ElementCount::get(
4223 ClampedUpperTripCount,
4224 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4225 }
4226
4227 TargetTransformInfo::RegisterKind RegKind =
4228 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4229 : TargetTransformInfo::RGK_FixedWidthVector;
4230 ElementCount MaxVF = MaxVectorElementCount;
4231 if (MaximizeBandwidth ||
4232 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4233 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4234 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4235 auto MaxVectorElementCountMaxBW = ElementCount::get(
4236 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4237 ComputeScalableMaxVF);
4238 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4239
4240 // Collect all viable vectorization factors larger than the default MaxVF
4241 // (i.e. MaxVectorElementCount).
4242 SmallVector<ElementCount, 8> VFs;
4243 for (ElementCount VS = MaxVectorElementCount * 2;
4244 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4245 VFs.push_back(VS);
4246
4247 // For each VF calculate its register usage.
4248 auto RUs = calculateRegisterUsage(VFs);
4249
4250 // Select the largest VF which doesn't require more registers than existing
4251 // ones.
4252 for (int I = RUs.size() - 1; I >= 0; --I) {
4253 const auto &MLU = RUs[I].MaxLocalUsers;
4254 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4255 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4256 })) {
4257 MaxVF = VFs[I];
4258 break;
4259 }
4260 }
4261 if (ElementCount MinVF =
4262 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4263 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4264 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4265 << ") with target's minimum: " << MinVF << '\n');
4266 MaxVF = MinVF;
4267 }
4268 }
4269
4270 // Invalidate any widening decisions we might have made, in case the loop
4271 // requires prediction (decided later), but we have already made some
4272 // load/store widening decisions.
4273 invalidateCostModelingDecisions();
4274 }
4275 return MaxVF;
4276 }
4277
4278 /// Convenience function that returns the value of vscale_range when
4279 /// vscale_range.min == vscale_range.max, and otherwise returns the value
4280 /// returned by the corresponding TTI method.
4281 static std::optional<unsigned>
4282 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4283 const Function *Fn = L->getHeader()->getParent();
4284 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4285 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4286 auto Min = Attr.getVScaleRangeMin();
4287 auto Max = Attr.getVScaleRangeMax();
4288 if (Max && Min == Max)
4289 return Max;
4290 }
4291
4292 return TTI.getVScaleForTuning();
4293 }
4294
4295 bool LoopVectorizationPlanner::isMoreProfitable(
4296 const VectorizationFactor &A, const VectorizationFactor &B) const {
4297 InstructionCost CostA = A.Cost;
4298 InstructionCost CostB = B.Cost;
4299
4300 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4301
4302 // Improve estimate for the vector width if it is scalable.
4303 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4304 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4305 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4306 if (A.Width.isScalable())
4307 EstimatedWidthA *= *VScale;
4308 if (B.Width.isScalable())
4309 EstimatedWidthB *= *VScale;
4310 }
4311
4312 // Assume vscale may be larger than 1 (or the value being tuned for),
4313 // so that scalable vectorization is slightly favorable over fixed-width
4314 // vectorization.
4315 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4316 A.Width.isScalable() && !B.Width.isScalable();
4317
4318 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4319 const InstructionCost &RHS) {
4320 return PreferScalable ? LHS <= RHS : LHS < RHS;
4321 };
4322
4323 // To avoid the need for FP division:
4324 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4325 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4326 if (!MaxTripCount)
4327 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
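// Worked example with hypothetical costs: CostA=8 at estimated width 4 vs.
// CostB=6 at estimated width 2 compares 8*2=16 against 6*4=24; 16 < 24, so
// A (2 per lane) beats B (3 per lane) without any FP division.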
4328
4329 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4330 InstructionCost VectorCost,
4331 InstructionCost ScalarCost) {
4332 // If the trip count is a known (possibly small) constant, the trip count
4333 // will be rounded up to an integer number of iterations under
4334 // FoldTailByMasking. The total cost in that case will be
4335 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4336 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4337 // some extra overheads, but for the purpose of comparing the costs of
4338 // different VFs we can use this to compare the total loop-body cost
4339 // expected after vectorization.
4340 if (CM.foldTailByMasking())
4341 return VectorCost * divideCeil(MaxTripCount, VF);
4342 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4343 };
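// Illustrative numbers: with MaxTripCount=10, VF=4, VectorCost=20 per
// vector iteration and ScalarCost=5 per scalar iteration, folding the tail
// costs 20*ceil(10/4)=60, while a scalar epilogue costs
// 20*floor(10/4) + 5*(10%4) = 40 + 10 = 50.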
4344
4345 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4346 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4347 return CmpFn(RTCostA, RTCostB);
4348 }
4349
4350 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4351 OptimizationRemarkEmitter *ORE,
4352 Loop *TheLoop) {
4353 if (InvalidCosts.empty())
4354 return;
4355
4356 // Emit a report of VFs with invalid costs in the loop.
4357
4358 // Group the remarks per instruction, keeping the instruction order from
4359 // InvalidCosts.
4360 std::map<Instruction *, unsigned> Numbering;
4361 unsigned I = 0;
4362 for (auto &Pair : InvalidCosts)
4363 if (!Numbering.count(Pair.first))
4364 Numbering[Pair.first] = I++;
4365
4366 // Sort the list, first on instruction(number) then on VF.
4367 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4368 if (Numbering[A.first] != Numbering[B.first])
4369 return Numbering[A.first] < Numbering[B.first];
4370 const auto &LHS = A.second;
4371 const auto &RHS = B.second;
4372 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4373 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4374 });
4375
4376 // For a list of ordered instruction-vf pairs:
4377 // [(load, vf1), (load, vf2), (store, vf1)]
4378 // Group the instructions together to emit separate remarks for:
4379 // load (vf1, vf2)
4380 // store (vf1)
4381 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4382 auto Subset = ArrayRef<InstructionVFPair>();
4383 do {
4384 if (Subset.empty())
4385 Subset = Tail.take_front(1);
4386
4387 Instruction *I = Subset.front().first;
4388
4389 // If the next instruction is different, or if there are no other pairs,
4390 // emit a remark for the collated subset. e.g.
4391 // [(load, vf1), (load, vf2)]
4392 // to emit:
4393 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4394 if (Subset == Tail || Tail[Subset.size()].first != I) {
4395 std::string OutString;
4396 raw_string_ostream OS(OutString);
4397 assert(!Subset.empty() && "Unexpected empty range");
4398 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4399 for (const auto &Pair : Subset)
4400 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4401 OS << "):";
4402 if (auto *CI = dyn_cast<CallInst>(I))
4403 OS << " call to " << CI->getCalledFunction()->getName();
4404 else
4405 OS << " " << I->getOpcodeName();
4406 OS.flush();
4407 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4408 Tail = Tail.drop_front(Subset.size());
4409 Subset = {};
4410 } else
4411 // Grow the subset by one element
4412 Subset = Tail.take_front(Subset.size() + 1);
4413 } while (!Tail.empty());
4414 }
4415
4416 /// Check if any recipe of \p Plan will generate a vector value, which will be
4417 /// assigned a vector register.
4418 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4419 const TargetTransformInfo &TTI) {
4420 assert(VF.isVector() && "Checking a scalar VF?");
4421 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
4422 Plan.getCanonicalIV()->getScalarType()->getContext());
4423 DenseSet<VPRecipeBase *> EphemeralRecipes;
4424 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4425 // Set of already visited types.
4426 DenseSet<Type *> Visited;
4427 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4428 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4429 for (VPRecipeBase &R : *VPBB) {
4430 if (EphemeralRecipes.contains(&R))
4431 continue;
4432 // Continue early if the recipe is considered to not produce a vector
4433 // result. Note that this includes VPInstruction where some opcodes may
4434 // produce a vector, to preserve existing behavior as VPInstructions model
4435 // aspects not directly mapped to existing IR instructions.
4436 switch (R.getVPDefID()) {
4437 case VPDef::VPDerivedIVSC:
4438 case VPDef::VPScalarIVStepsSC:
4439 case VPDef::VPScalarCastSC:
4440 case VPDef::VPReplicateSC:
4441 case VPDef::VPInstructionSC:
4442 case VPDef::VPCanonicalIVPHISC:
4443 case VPDef::VPVectorPointerSC:
4444 case VPDef::VPExpandSCEVSC:
4445 case VPDef::VPEVLBasedIVPHISC:
4446 case VPDef::VPPredInstPHISC:
4447 case VPDef::VPBranchOnMaskSC:
4448 continue;
4449 case VPDef::VPReductionSC:
4450 case VPDef::VPActiveLaneMaskPHISC:
4451 case VPDef::VPWidenCallSC:
4452 case VPDef::VPWidenCanonicalIVSC:
4453 case VPDef::VPWidenCastSC:
4454 case VPDef::VPWidenGEPSC:
4455 case VPDef::VPWidenSC:
4456 case VPDef::VPWidenSelectSC:
4457 case VPDef::VPBlendSC:
4458 case VPDef::VPFirstOrderRecurrencePHISC:
4459 case VPDef::VPWidenPHISC:
4460 case VPDef::VPWidenIntOrFpInductionSC:
4461 case VPDef::VPWidenPointerInductionSC:
4462 case VPDef::VPReductionPHISC:
4463 case VPDef::VPInterleaveSC:
4464 case VPDef::VPWidenLoadEVLSC:
4465 case VPDef::VPWidenLoadSC:
4466 case VPDef::VPWidenStoreEVLSC:
4467 case VPDef::VPWidenStoreSC:
4468 break;
4469 default:
4470 llvm_unreachable("unhandled recipe");
4471 }
4472
4473 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4474 Type *VectorTy = ToVectorTy(ScalarTy, VF);
4475 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4476 if (!NumLegalParts)
4477 return false;
4478 if (VF.isScalable()) {
4479 // <vscale x 1 x iN> is assumed to be profitable over iN because
4480 // scalable registers are a distinct register class from scalar
4481 // ones. If we ever find a target which wants to lower scalable
4482 // vectors back to scalars, we'll need to update this code to
4483 // explicitly ask TTI about the register class uses for each part.
4484 return NumLegalParts <= VF.getKnownMinValue();
4485 }
4486 // Two or more elements sharing a single part (register) means the value is genuinely vectorized.
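// For instance, on a made-up target with 128-bit registers, <8 x i64>
// legalizes to 4 parts; 4 < 8 means several elements share a register,
// so the recipe genuinely produces vector code.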
4487 return NumLegalParts < VF.getKnownMinValue();
4488 };
4489
4490 // If the recipe defines no values and is not a store (e.g., a branch), continue - there is no value to check.
4491 if (R.getNumDefinedValues() == 0 &&
4492 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4493 &R))
4494 continue;
4495 // For multi-def recipes (currently only interleaved loads), it suffices to
4496 // check the first defined value only.
4497 // For stores check their stored value; for interleaved stores it suffices
4498 // to check the first stored value only. In all cases this is the second
4499 // operand.
4500 VPValue *ToCheck =
4501 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4502 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4503 if (!Visited.insert({ScalarTy}).second)
4504 continue;
4505 if (WillWiden(ScalarTy))
4506 return true;
4507 }
4508 }
4509
4510 return false;
4511 }
4512
4513 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4514 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4515 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4516 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4517 assert(any_of(VPlans,
4518 [](std::unique_ptr<VPlan> &P) {
4519 return P->hasVF(ElementCount::getFixed(1));
4520 }) &&
4521 "Expected Scalar VF to be a candidate");
4522
4523 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4524 ExpectedCost);
4525 VectorizationFactor ChosenFactor = ScalarCost;
4526
4527 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4528 if (ForceVectorization &&
4529 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4530 // Ignore scalar width, because the user explicitly wants vectorization.
4531 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4532 // evaluation.
4533 ChosenFactor.Cost = InstructionCost::getMax();
4534 }
4535
4536 SmallVector<InstructionVFPair> InvalidCosts;
4537 for (auto &P : VPlans) {
4538 for (ElementCount VF : P->vectorFactors()) {
4539 // The cost for scalar VF=1 is already calculated, so ignore it.
4540 if (VF.isScalar())
4541 continue;
4542
4543 InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
4544 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4545
4546 #ifndef NDEBUG
4547 unsigned AssumedMinimumVscale =
4548 getVScaleForTuning(OrigLoop, TTI).value_or(1);
4549 unsigned Width =
4550 Candidate.Width.isScalable()
4551 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4552 : Candidate.Width.getFixedValue();
4553 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4554 << " costs: " << (Candidate.Cost / Width));
4555 if (VF.isScalable())
4556 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4557 << AssumedMinimumVscale << ")");
4558 LLVM_DEBUG(dbgs() << ".\n");
4559 #endif
4560
4561 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4562 LLVM_DEBUG(
4563 dbgs()
4564 << "LV: Not considering vector loop of width " << VF
4565 << " because it will not generate any vector instructions.\n");
4566 continue;
4567 }
4568
4569 // If profitable add it to ProfitableVF list.
4570 if (isMoreProfitable(Candidate, ScalarCost))
4571 ProfitableVFs.push_back(Candidate);
4572
4573 if (isMoreProfitable(Candidate, ChosenFactor))
4574 ChosenFactor = Candidate;
4575 }
4576 }
4577
4578 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
4579
4580 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4581 reportVectorizationFailure(
4582 "There are conditional stores.",
4583 "store that is conditionally executed prevents vectorization",
4584 "ConditionalStore", ORE, OrigLoop);
4585 ChosenFactor = ScalarCost;
4586 }
4587
4588 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4589 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4590 << "LV: Vectorization seems to be not beneficial, "
4591 << "but was forced by a user.\n");
4592 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
4593 return ChosenFactor;
4594 }
4595
4596 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4597 ElementCount VF) const {
4598 // Cross iteration phis such as reductions need special handling and are
4599 // currently unsupported.
4600 if (any_of(OrigLoop->getHeader()->phis(),
4601 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4602 return false;
4603
4604 // Phis with uses outside of the loop require special handling and are
4605 // currently unsupported.
4606 for (const auto &Entry : Legal->getInductionVars()) {
4607 // Look for uses of the value of the induction at the last iteration.
4608 Value *PostInc =
4609 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4610 for (User *U : PostInc->users())
4611 if (!OrigLoop->contains(cast<Instruction>(U)))
4612 return false;
4613 // Look for uses of penultimate value of the induction.
4614 for (User *U : Entry.first->users())
4615 if (!OrigLoop->contains(cast<Instruction>(U)))
4616 return false;
4617 }
4618
4619 // Epilogue vectorization code has not been audited to ensure it handles
4620 // non-latch exits properly. It may be fine, but it needs to be audited
4621 // and tested.
4622 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4623 return false;
4624
4625 return true;
4626 }
4627
4628 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4629 const ElementCount VF) const {
4630 // FIXME: We need a much better cost-model to take different parameters such
4631 // as register pressure, code size increase and cost of extra branches into
4632 // account. For now we apply a very crude heuristic and only consider loops
4633 // with vectorization factors larger than a certain value.
4634
4635 // Allow the target to opt out entirely.
4636 if (!TTI.preferEpilogueVectorization())
4637 return false;
4638
4639 // We also consider epilogue vectorization unprofitable for targets that don't
4640 // consider interleaving beneficial (e.g. MVE).
4641 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4642 return false;
4643
4644 unsigned Multiplier = 1;
4645 if (VF.isScalable())
4646 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
4647 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
4648 return true;
4649 return false;
4650 }
4651
4652 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4653 const ElementCount MainLoopVF, unsigned IC) {
4654 VectorizationFactor Result = VectorizationFactor::Disabled();
4655 if (!EnableEpilogueVectorization) {
4656 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4657 return Result;
4658 }
4659
4660 if (!CM.isScalarEpilogueAllowed()) {
4661 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4662 "epilogue is allowed.\n");
4663 return Result;
4664 }
4665
4666 // Not really a cost consideration, but check for unsupported cases here to
4667 // simplify the logic.
4668 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4669 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4670 "is not a supported candidate.\n");
4671 return Result;
4672 }
4673
4674 if (EpilogueVectorizationForceVF > 1) {
4675 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4676 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4677 if (hasPlanWithVF(ForcedEC))
4678 return {ForcedEC, 0, 0};
4679 else {
4680 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4681 "viable.\n");
4682 return Result;
4683 }
4684 }
4685
4686 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4687 OrigLoop->getHeader()->getParent()->hasMinSize()) {
4688 LLVM_DEBUG(
4689 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4690 return Result;
4691 }
4692
4693 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
4694 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4695 "this loop\n");
4696 return Result;
4697 }
4698
4699 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4700 // the main loop handles 8 lanes per iteration. We could still benefit from
4701 // vectorizing the epilogue loop with VF=4.
4702 ElementCount EstimatedRuntimeVF = MainLoopVF;
4703 if (MainLoopVF.isScalable()) {
4704 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
4705 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
4706 EstimatedRuntimeVF *= *VScale;
4707 }
4708
4709 ScalarEvolution &SE = *PSE.getSE();
4710 Type *TCType = Legal->getWidestInductionType();
4711 const SCEV *RemainingIterations = nullptr;
4712 for (auto &NextVF : ProfitableVFs) {
4713 // Skip candidate VFs without a corresponding VPlan.
4714 if (!hasPlanWithVF(NextVF.Width))
4715 continue;
4716
4717 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
4718 // vectors) or the VF of the main loop (fixed vectors).
4719 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4720 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4721 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
4722 continue;
4723
4724 // If NextVF is greater than the number of remaining iterations, the
4725 // epilogue loop would be dead. Skip such factors.
4726 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4727 // TODO: extend to support scalable VFs.
4728 if (!RemainingIterations) {
4729 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
4730 RemainingIterations = SE.getURemExpr(
4731 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4732 }
4733 if (SE.isKnownPredicate(
4734 CmpInst::ICMP_UGT,
4735 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4736 RemainingIterations))
4737 continue;
4738 }
4739
4740 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
4741 Result = NextVF;
4742 }
4743
4744 if (Result != VectorizationFactor::Disabled())
4745 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4746 << Result.Width << "\n");
4747 return Result;
4748 }
4749
4750 std::pair<unsigned, unsigned>
4751 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4752 unsigned MinWidth = -1U;
4753 unsigned MaxWidth = 8;
4754 const DataLayout &DL = TheFunction->getDataLayout();
4755 // For in-loop reductions, no element types are added to ElementTypesInLoop
4756 // if there are no loads/stores in the loop. In this case, check through the
4757 // reduction variables to determine the maximum width.
4758 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4759 // Reset MaxWidth so that we can find the smallest type used by recurrences
4760 // in the loop.
4761 MaxWidth = -1U;
4762 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4763 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4764 // When finding the min width used by the recurrence we need to account
4765 // for casts on the input operands of the recurrence.
4766 MaxWidth = std::min<unsigned>(
4767 MaxWidth, std::min<unsigned>(
4768 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4769 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4770 }
4771 } else {
4772 for (Type *T : ElementTypesInLoop) {
4773 MinWidth = std::min<unsigned>(
4774 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4775 MaxWidth = std::max<unsigned>(
4776 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4777 }
4778 }
4779 return {MinWidth, MaxWidth};
4780 }
4781
4782 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4783 ElementTypesInLoop.clear();
4784 // For each block.
4785 for (BasicBlock *BB : TheLoop->blocks()) {
4786 // For each instruction in the loop.
4787 for (Instruction &I : BB->instructionsWithoutDebug()) {
4788 Type *T = I.getType();
4789
4790 // Skip ignored values.
4791 if (ValuesToIgnore.count(&I))
4792 continue;
4793
4794 // Only examine Loads, Stores and PHINodes.
4795 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4796 continue;
4797
4798 // Examine PHI nodes that are reduction variables. Update the type to
4799 // account for the recurrence type.
4800 if (auto *PN = dyn_cast<PHINode>(&I)) {
4801 if (!Legal->isReductionVariable(PN))
4802 continue;
4803 const RecurrenceDescriptor &RdxDesc =
4804 Legal->getReductionVars().find(PN)->second;
4805 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4806 TTI.preferInLoopReduction(RdxDesc.getOpcode(),
4807 RdxDesc.getRecurrenceType(),
4808 TargetTransformInfo::ReductionFlags()))
4809 continue;
4810 T = RdxDesc.getRecurrenceType();
4811 }
4812
4813 // Examine the stored values.
4814 if (auto *ST = dyn_cast<StoreInst>(&I))
4815 T = ST->getValueOperand()->getType();
4816
4817 assert(T->isSized() &&
4818 "Expected the load/store/recurrence type to be sized");
4819
4820 ElementTypesInLoop.insert(T);
4821 }
4822 }
4823 }
4824
4825 unsigned
4826 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4827 InstructionCost LoopCost) {
4828 // -- The interleave heuristics --
4829 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4830 // There are many micro-architectural considerations that we can't predict
4831 // at this level. For example, frontend pressure (on decode or fetch) due to
4832 // code size, or the number and capabilities of the execution ports.
4833 //
4834 // We use the following heuristics to select the interleave count:
4835 // 1. If the code has reductions, then we interleave to break the cross
4836 // iteration dependency.
4837 // 2. If the loop is really small, then we interleave to reduce the loop
4838 // overhead.
4839 // 3. We don't interleave if we think that we will spill registers to memory
4840 // due to the increased register pressure.
4841
4842 if (!isScalarEpilogueAllowed())
4843 return 1;
4844
4845 // Do not interleave if EVL is preferred and no User IC is specified.
4846 if (foldTailWithEVL()) {
4847 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4848 "Unroll factor forced to be 1.\n");
4849 return 1;
4850 }
4851
4852 // The maximum safe dependence distance already limits the vector width; do not interleave further.
4853 if (!Legal->isSafeForAnyVectorWidth())
4854 return 1;
4855
4856 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
4857 const bool HasReductions = !Legal->getReductionVars().empty();
4858
4859 // If we did not calculate the cost for VF (because the user selected the VF)
4860 // then we calculate the cost of VF here.
4861 if (LoopCost == 0) {
4862 LoopCost = expectedCost(VF);
4863 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4864
4865 // Loop body is free and there is no need for interleaving.
4866 if (LoopCost == 0)
4867 return 1;
4868 }
4869
4870 RegisterUsage R = calculateRegisterUsage({VF})[0];
4871 // We divide by these values below, so make sure each count is at least one,
4872 // i.e. assume every instruction uses at least one register.
4873 for (auto& pair : R.MaxLocalUsers) {
4874 pair.second = std::max(pair.second, 1U);
4875 }
4876
4877 // We calculate the interleave count using the following formula.
4878 // Subtract the number of loop invariants from the number of available
4879 // registers. These registers are used by all of the interleaved instances.
4880 // Next, divide the remaining registers by the number of registers that is
4881 // required by the loop, in order to estimate how many parallel instances
4882 // fit without causing spills. All of this is rounded down if necessary to be
4883 // a power of two. We want power of two interleave count to simplify any
4884 // addressing operations or alignment considerations.
4885 // We also want power of two interleave counts to ensure that the induction
4886 // variable of the vector loop wraps to zero, when tail is folded by masking;
4887 // this currently happens when OptForSize, in which case IC is set to 1 above.
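// Rough example with a hypothetical register file: 32 vector registers,
// 2 loop-invariant values and a peak in-loop usage of 6 registers give an
// estimate of bit_floor((32 - 2) / 6) = 4 interleaved copies.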
4888 unsigned IC = UINT_MAX;
4889
4890 for (auto& pair : R.MaxLocalUsers) {
4891 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4892 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4893 << " registers of "
4894 << TTI.getRegisterClassName(pair.first) << " register class\n");
4895 if (VF.isScalar()) {
4896 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4897 TargetNumRegisters = ForceTargetNumScalarRegs;
4898 } else {
4899 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4900 TargetNumRegisters = ForceTargetNumVectorRegs;
4901 }
4902 unsigned MaxLocalUsers = pair.second;
4903 unsigned LoopInvariantRegs = 0;
4904 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
4905 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
4906
4907 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4908 MaxLocalUsers);
4909 // Don't count the induction variable as interleaved.
4910 if (EnableIndVarRegisterHeur) {
4911 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4912 std::max(1U, (MaxLocalUsers - 1)));
4913 }
4914
4915 IC = std::min(IC, TmpIC);
4916 }
4917
4918 // Clamp the interleave ranges to reasonable counts.
4919 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4920
4921 // Check if the user has overridden the max.
4922 if (VF.isScalar()) {
4923 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4924 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4925 } else {
4926 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4927 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4928 }
4929
4930 unsigned EstimatedVF = VF.getKnownMinValue();
4931 if (VF.isScalable()) {
4932 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
4933 EstimatedVF *= *VScale;
4934 }
4935 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4936
4937 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4938 if (KnownTC > 0) {
4939 // At least one iteration must be scalar when this constraint holds. So the
4940 // maximum available iterations for interleaving is one less.
4941 unsigned AvailableTC =
4942 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4943
4944 // If trip count is known we select between two prospective ICs, where
4945 // 1) the aggressive IC is capped by the trip count divided by VF
4946 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4947 // The final IC is selected in a way that the epilogue loop trip count is
4948 // minimized while maximizing the IC itself, so that we either run the
4949 // vector loop at least once if it generates a small epilogue loop, or else
4950 // we run the vector loop at least twice.
4951
4952 unsigned InterleaveCountUB = bit_floor(
4953 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4954 unsigned InterleaveCountLB = bit_floor(std::max(
4955 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
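// Illustrative numbers: AvailableTC=40, EstimatedVF=4, MaxInterleaveCount=8
// give InterleaveCountUB = bit_floor(min(40/4, 8)) = 8 and
// InterleaveCountLB = bit_floor(min(40/8, 8)) = 4.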
4956 MaxInterleaveCount = InterleaveCountLB;
4957
4958 if (InterleaveCountUB != InterleaveCountLB) {
4959 unsigned TailTripCountUB =
4960 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4961 unsigned TailTripCountLB =
4962 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4963 // If both produce same scalar tail, maximize the IC to do the same work
4964 // in fewer vector loop iterations
4965 if (TailTripCountUB == TailTripCountLB)
4966 MaxInterleaveCount = InterleaveCountUB;
4967 }
4968 } else if (BestKnownTC && *BestKnownTC > 0) {
4969 // At least one iteration must be scalar when this constraint holds. So the
4970 // maximum available iterations for interleaving is one less.
4971 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4972 ? (*BestKnownTC) - 1
4973 : *BestKnownTC;
4974
4975 // If the trip count is only an estimated compile-time constant, cap the IC
4976 // at the trip count divided by (VF * 2), so that the vector loop runs at
4977 // least twice to make interleaving seem profitable when there is an
4978 // epilogue loop present. Since the exact trip count is not known, we
4979 // choose to be conservative in our IC estimate.
4980 MaxInterleaveCount = bit_floor(std::max(
4981 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4982 }
4983
4984 assert(MaxInterleaveCount > 0 &&
4985 "Maximum interleave count must be greater than 0");
4986
4987 // Clamp the calculated IC to be between the 1 and the max interleave count
4988 // that the target and trip count allows.
4989 if (IC > MaxInterleaveCount)
4990 IC = MaxInterleaveCount;
4991 else
4992 // Make sure IC is greater than 0.
4993 IC = std::max(1u, IC);
4994
4995 assert(IC > 0 && "Interleave count must be greater than 0.");
4996
4997 // Interleave if we vectorized this loop and there is a reduction that could
4998 // benefit from interleaving.
4999 if (VF.isVector() && HasReductions) {
5000 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5001 return IC;
5002 }
5003
5004 // For any scalar loop that either requires runtime checks or predication we
5005 // are better off leaving this to the unroller. Note that if we've already
5006 // vectorized the loop we will have done the runtime check and so interleaving
5007 // won't require further checks.
5008 bool ScalarInterleavingRequiresPredication =
5009 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5010 return Legal->blockNeedsPredication(BB);
5011 }));
5012 bool ScalarInterleavingRequiresRuntimePointerCheck =
5013 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5014
5015 // We want to interleave small loops in order to reduce the loop overhead and
5016 // potentially expose ILP opportunities.
5017 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5018 << "LV: IC is " << IC << '\n'
5019 << "LV: VF is " << VF << '\n');
5020 const bool AggressivelyInterleaveReductions =
5021 TTI.enableAggressiveInterleaving(HasReductions);
5022 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5023 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5024 // We assume that the cost overhead is 1 and we use the cost model
5025 // to estimate the cost of the loop and interleave until the cost of the
5026 // loop overhead is about 5% of the cost of the loop.
5027 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5028 SmallLoopCost / *LoopCost.getValue()));
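// E.g. with made-up costs SmallLoopCost=20 and LoopCost=4, this caps the
// interleave count at bit_floor(20 / 4) = 4.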
5029
5030 // Interleave until store/load ports (estimated by max interleave count) are
5031 // saturated.
5032 unsigned NumStores = Legal->getNumStores();
5033 unsigned NumLoads = Legal->getNumLoads();
5034 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5035 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5036
5037 // There is little point in interleaving for reductions containing selects
5038 // and compares when VF=1 since it may just create more overhead than it's
5039 // worth for loops with small trip counts. This is because we still have to
5040 // do the final reduction after the loop.
5041 bool HasSelectCmpReductions =
5042 HasReductions &&
5043 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5044 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5045 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5046 RdxDesc.getRecurrenceKind());
5047 });
5048 if (HasSelectCmpReductions) {
5049 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5050 return 1;
5051 }
5052
5053 // If we have a scalar reduction (vector reductions are already dealt with
5054 // by this point), we can increase the critical path length if the loop
5055 // we're interleaving is inside another loop. For tree-wise reductions
5056 // set the limit to 2, and for ordered reductions it's best to disable
5057 // interleaving entirely.
5058 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5059 bool HasOrderedReductions =
5060 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5061 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5062 return RdxDesc.isOrdered();
5063 });
5064 if (HasOrderedReductions) {
5065 LLVM_DEBUG(
5066 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5067 return 1;
5068 }
5069
5070 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5071 SmallIC = std::min(SmallIC, F);
5072 StoresIC = std::min(StoresIC, F);
5073 LoadsIC = std::min(LoadsIC, F);
5074 }
5075
5076 if (EnableLoadStoreRuntimeInterleave &&
5077 std::max(StoresIC, LoadsIC) > SmallIC) {
5078 LLVM_DEBUG(
5079 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5080 return std::max(StoresIC, LoadsIC);
5081 }
5082
5083 // If there are scalar reductions and TTI has enabled aggressive
5084 // interleaving for reductions, we will interleave to expose ILP.
5085 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5086 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5087 // Interleave no less than SmallIC but not as aggressive as the normal IC
5088 // to satisfy the rare situation when resources are too limited.
5089 return std::max(IC / 2, SmallIC);
5090 } else {
5091 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5092 return SmallIC;
5093 }
5094 }
5095
5096 // Interleave if this is a large loop (small loops are already dealt with by
5097 // this point) that could benefit from interleaving.
5098 if (AggressivelyInterleaveReductions) {
5099 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5100 return IC;
5101 }
5102
5103 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5104 return 1;
5105 }
5106
5107 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5108 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5109 // This function calculates the register usage by measuring the highest number
5110 // of values that are alive at a single location. Obviously, this is a very
5111 // rough estimation. We scan the loop in topological order and
5112 // assign a number to each instruction. We use RPO to ensure that defs are
5113 // met before their users. We assume that each instruction that has in-loop
5114 // users starts an interval. We record every time that an in-loop value is
5115 // used, so we have a list of the first and last occurrences of each
5116 // instruction. Next, we transpose this data structure into a multi map that
5117 // holds the list of intervals that *end* at a specific location. This multi
5118 // map allows us to perform a linear search. We scan the instructions linearly
5119 // and record each time that a new interval starts, by placing it in a set.
5120 // If we find this value in the multi-map then we remove it from the set.
5121 // The max register usage is the maximum size of the set.
5122 // We also search for instructions that are defined outside the loop, but are
5123 // used inside the loop. We need this number separately from the max-interval
5124 // usage number because when we unroll, loop-invariant values do not take
5125 // more registers.
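  //
  // A small worked example of the interval bookkeeping (hypothetical
  // instructions): if %a is defined at index 1 with its last use at index 4,
  // and %b is defined at index 3 with its last use at index 5, then both
  // intervals are open at indices 3 and 4, so the maximum number of
  // simultaneously open intervals, and hence the register estimate for that
  // class, is 2.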
5126 LoopBlocksDFS DFS(TheLoop);
5127 DFS.perform(LI);
5128
5129 RegisterUsage RU;
5130
5131 // Each 'key' in the map opens a new interval. The values
5132 // of the map are the index of the 'last seen' usage of the
5133 // instruction that is the key.
5134 using IntervalMap = DenseMap<Instruction *, unsigned>;
5135
5136 // Maps instruction to its index.
5137 SmallVector<Instruction *, 64> IdxToInstr;
5138 // Marks the end of each interval.
5139 IntervalMap EndPoint;
5140 // Saves the list of instruction indices that are used in the loop.
5141 SmallPtrSet<Instruction *, 8> Ends;
5142 // Saves the list of values that are used in the loop but are defined outside
5143 // the loop (not including non-instruction values such as arguments and
5144 // constants).
5145 SmallSetVector<Instruction *, 8> LoopInvariants;
5146
5147 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5148 for (Instruction &I : BB->instructionsWithoutDebug()) {
5149 IdxToInstr.push_back(&I);
5150
5151 // Save the end location of each USE.
5152 for (Value *U : I.operands()) {
5153 auto *Instr = dyn_cast<Instruction>(U);
5154
5155 // Ignore non-instruction values such as arguments, constants, etc.
5156 // FIXME: Might need some motivation why these values are ignored. If
5157 // for example an argument is used inside the loop it will increase the
5158         // register pressure (so shouldn't we add it to LoopInvariants?).
5159 if (!Instr)
5160 continue;
5161
5162 // If this instruction is outside the loop then record it and continue.
5163 if (!TheLoop->contains(Instr)) {
5164 LoopInvariants.insert(Instr);
5165 continue;
5166 }
5167
5168 // Overwrite previous end points.
5169 EndPoint[Instr] = IdxToInstr.size();
5170 Ends.insert(Instr);
5171 }
5172 }
5173 }
5174
5175 // Saves the list of intervals that end with the index in 'key'.
5176 using InstrList = SmallVector<Instruction *, 2>;
5177 DenseMap<unsigned, InstrList> TransposeEnds;
5178
5179 // Transpose the EndPoints to a list of values that end at each index.
5180 for (auto &Interval : EndPoint)
5181 TransposeEnds[Interval.second].push_back(Interval.first);
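  // For example (hypothetical values): if both %a and %b have their last use
  // recorded at index 5, then TransposeEnds[5] contains {%a, %b} and both
  // intervals are closed when the scan below reaches index 5.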
5182
5183 SmallPtrSet<Instruction *, 8> OpenIntervals;
5184 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5185 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5186
5187 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5188
5189 const auto &TTICapture = TTI;
5190 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5191 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5192 return 0;
5193 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5194 };
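  // For instance, GetRegUsage(i32, ElementCount::getFixed(4)) asks the target
  // how many registers a <4 x i32> value occupies; token types and types that
  // are not valid vector element types count as zero.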
5195
5196 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5197 Instruction *I = IdxToInstr[i];
5198
5199 // Remove all of the instructions that end at this location.
5200 InstrList &List = TransposeEnds[i];
5201 for (Instruction *ToRemove : List)
5202 OpenIntervals.erase(ToRemove);
5203
5204 // Ignore instructions that are never used within the loop.
5205 if (!Ends.count(I))
5206 continue;
5207
5208 // Skip ignored values.
5209 if (ValuesToIgnore.count(I))
5210 continue;
5211
5212 collectInLoopReductions();
5213
5214 // For each VF find the maximum usage of registers.
5215 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5216 // Count the number of registers used, per register class, given all open
5217 // intervals.
5218 // Note that elements in this SmallMapVector will be default constructed
5219 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5220 // there is no previous entry for ClassID.
5221 SmallMapVector<unsigned, unsigned, 4> RegUsage;
5222
5223 if (VFs[j].isScalar()) {
5224 for (auto *Inst : OpenIntervals) {
5225 unsigned ClassID =
5226 TTI.getRegisterClassForType(false, Inst->getType());
5227 // FIXME: The target might use more than one register for the type
5228 // even in the scalar case.
5229 RegUsage[ClassID] += 1;
5230 }
5231 } else {
5232 collectUniformsAndScalars(VFs[j]);
5233 for (auto *Inst : OpenIntervals) {
5234 // Skip ignored values for VF > 1.
5235 if (VecValuesToIgnore.count(Inst))
5236 continue;
5237 if (isScalarAfterVectorization(Inst, VFs[j])) {
5238 unsigned ClassID =
5239 TTI.getRegisterClassForType(false, Inst->getType());
5240 // FIXME: The target might use more than one register for the type
5241 // even in the scalar case.
5242 RegUsage[ClassID] += 1;
5243 } else {
5244 unsigned ClassID =
5245 TTI.getRegisterClassForType(true, Inst->getType());
5246 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5247 }
5248 }
5249 }
5250
5251     for (auto &pair : RegUsage) {
5252 auto &Entry = MaxUsages[j][pair.first];
5253 Entry = std::max(Entry, pair.second);
5254 }
5255 }
5256
5257 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5258 << OpenIntervals.size() << '\n');
5259
5260 // Add the current instruction to the list of open intervals.
5261 OpenIntervals.insert(I);
5262 }
5263
5264 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5265 // Note that elements in this SmallMapVector will be default constructed
5266 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5267 // there is no previous entry for ClassID.
5268 SmallMapVector<unsigned, unsigned, 4> Invariant;
5269
5270 for (auto *Inst : LoopInvariants) {
5271 // FIXME: The target might use more than one register for the type
5272 // even in the scalar case.
5273 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5274 auto *I = cast<Instruction>(U);
5275 return TheLoop != LI->getLoopFor(I->getParent()) ||
5276 isScalarAfterVectorization(I, VFs[i]);
5277 });
5278
5279 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5280 unsigned ClassID =
5281 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5282 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5283 }
5284
5285 LLVM_DEBUG({
5286 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5287 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5288 << " item\n";
5289 for (const auto &pair : MaxUsages[i]) {
5290 dbgs() << "LV(REG): RegisterClass: "
5291 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5292 << " registers\n";
5293 }
5294 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5295 << " item\n";
5296 for (const auto &pair : Invariant) {
5297 dbgs() << "LV(REG): RegisterClass: "
5298 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5299 << " registers\n";
5300 }
5301 });
5302
5303 RU.LoopInvariantRegs = Invariant;
5304 RU.MaxLocalUsers = MaxUsages[i];
5305 RUs[i] = RU;
5306 }
5307
5308 return RUs;
5309 }
5310
5311 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5312 ElementCount VF) {
5313 // TODO: Cost model for emulated masked load/store is completely
5314 // broken. This hack guides the cost model to use an artificially
5315 // high enough value to practically disable vectorization with such
5316 // operations, except where previously deployed legality hack allowed
5317 // using very low cost values. This is to avoid regressions coming simply
5318 // from moving "masked load/store" check from legality to cost model.
5319 // Masked Load/Gather emulation was previously never allowed.
5320   // A limited number of Masked Store/Scatter emulations was allowed.
5321 assert((isPredicatedInst(I)) &&
5322 "Expecting a scalar emulated instruction");
5323 return isa<LoadInst>(I) ||
5324 (isa<StoreInst>(I) &&
5325 NumPredStores > NumberOfStoresToPredicate);
5326 }
5327
5328 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5329 // If we aren't vectorizing the loop, or if we've already collected the
5330 // instructions to scalarize, there's nothing to do. Collection may already
5331 // have occurred if we have a user-selected VF and are now computing the
5332 // expected cost for interleaving.
5333 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5334 return;
5335
5336   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5337 // not profitable to scalarize any instructions, the presence of VF in the
5338 // map will indicate that we've analyzed it already.
5339 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5340
5341 PredicatedBBsAfterVectorization[VF].clear();
5342
5343 // Find all the instructions that are scalar with predication in the loop and
5344 // determine if it would be better to not if-convert the blocks they are in.
5345 // If so, we also record the instructions to scalarize.
5346 for (BasicBlock *BB : TheLoop->blocks()) {
5347 if (!blockNeedsPredicationForAnyReason(BB))
5348 continue;
5349 for (Instruction &I : *BB)
5350 if (isScalarWithPredication(&I, VF)) {
5351 ScalarCostsTy ScalarCosts;
5352 // Do not apply discount logic for:
5353 // 1. Scalars after vectorization, as there will only be a single copy
5354 // of the instruction.
5355 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5356 // 3. Emulated masked memrefs, if a hacked cost is needed.
5357 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5358 !useEmulatedMaskMemRefHack(&I, VF) &&
5359 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5360 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5361 // Remember that BB will remain after vectorization.
5362 PredicatedBBsAfterVectorization[VF].insert(BB);
5363 for (auto *Pred : predecessors(BB)) {
5364 if (Pred->getSingleSuccessor() == BB)
5365 PredicatedBBsAfterVectorization[VF].insert(Pred);
5366 }
5367 }
5368 }
5369 }
5370
5371 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5372 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5373 assert(!isUniformAfterVectorization(PredInst, VF) &&
5374 "Instruction marked uniform-after-vectorization will be predicated");
5375
5376 // Initialize the discount to zero, meaning that the scalar version and the
5377 // vector version cost the same.
5378 InstructionCost Discount = 0;
5379
5380 // Holds instructions to analyze. The instructions we visit are mapped in
5381 // ScalarCosts. Those instructions are the ones that would be scalarized if
5382 // we find that the scalar version costs less.
5383 SmallVector<Instruction *, 8> Worklist;
5384
5385 // Returns true if the given instruction can be scalarized.
5386 auto canBeScalarized = [&](Instruction *I) -> bool {
5387 // We only attempt to scalarize instructions forming a single-use chain
5388 // from the original predicated block that would otherwise be vectorized.
5389 // Although not strictly necessary, we give up on instructions we know will
5390 // already be scalar to avoid traversing chains that are unlikely to be
5391 // beneficial.
5392 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5393 isScalarAfterVectorization(I, VF))
5394 return false;
5395
5396 // If the instruction is scalar with predication, it will be analyzed
5397 // separately. We ignore it within the context of PredInst.
5398 if (isScalarWithPredication(I, VF))
5399 return false;
5400
5401 // If any of the instruction's operands are uniform after vectorization,
5402 // the instruction cannot be scalarized. This prevents, for example, a
5403 // masked load from being scalarized.
5404 //
5405 // We assume we will only emit a value for lane zero of an instruction
5406 // marked uniform after vectorization, rather than VF identical values.
5407 // Thus, if we scalarize an instruction that uses a uniform, we would
5408 // create uses of values corresponding to the lanes we aren't emitting code
5409 // for. This behavior can be changed by allowing getScalarValue to clone
5410 // the lane zero values for uniforms rather than asserting.
5411 for (Use &U : I->operands())
5412 if (auto *J = dyn_cast<Instruction>(U.get()))
5413 if (isUniformAfterVectorization(J, VF))
5414 return false;
5415
5416 // Otherwise, we can scalarize the instruction.
5417 return true;
5418 };
5419
5420 // Compute the expected cost discount from scalarizing the entire expression
5421 // feeding the predicated instruction. We currently only consider expressions
5422 // that are single-use instruction chains.
5423 Worklist.push_back(PredInst);
5424 while (!Worklist.empty()) {
5425 Instruction *I = Worklist.pop_back_val();
5426
5427 // If we've already analyzed the instruction, there's nothing to do.
5428 if (ScalarCosts.contains(I))
5429 continue;
5430
5431 // Compute the cost of the vector instruction. Note that this cost already
5432 // includes the scalarization overhead of the predicated instruction.
5433 InstructionCost VectorCost = getInstructionCost(I, VF);
5434
5435 // Compute the cost of the scalarized instruction. This cost is the cost of
5436 // the instruction as if it wasn't if-converted and instead remained in the
5437 // predicated block. We will scale this cost by block probability after
5438 // computing the scalarization overhead.
5439 InstructionCost ScalarCost =
5440 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5441
5442 // Compute the scalarization overhead of needed insertelement instructions
5443 // and phi nodes.
5444 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5445 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5446 ScalarCost += TTI.getScalarizationOverhead(
5447 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5448 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5449 /*Extract*/ false, CostKind);
5450 ScalarCost +=
5451 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5452 }
5453
5454 // Compute the scalarization overhead of needed extractelement
5455 // instructions. For each of the instruction's operands, if the operand can
5456 // be scalarized, add it to the worklist; otherwise, account for the
5457 // overhead.
5458 for (Use &U : I->operands())
5459 if (auto *J = dyn_cast<Instruction>(U.get())) {
5460 assert(VectorType::isValidElementType(J->getType()) &&
5461 "Instruction has non-scalar type");
5462 if (canBeScalarized(J))
5463 Worklist.push_back(J);
5464 else if (needsExtract(J, VF)) {
5465 ScalarCost += TTI.getScalarizationOverhead(
5466 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5467 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5468 /*Extract*/ true, CostKind);
5469 }
5470 }
5471
5472 // Scale the total scalar cost by block probability.
5473 ScalarCost /= getReciprocalPredBlockProb();
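    // For example (assuming the usual reciprocal block probability of 2): a
    // scalarized cost of 4 accumulated above becomes 4 / 2 = 2 here,
    // reflecting that the predicated block only runs on a fraction of the
    // iterations.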
5474
5475 // Compute the discount. A non-negative discount means the vector version
5476 // of the instruction costs more, and scalarizing would be beneficial.
5477 Discount += VectorCost - ScalarCost;
5478 ScalarCosts[I] = ScalarCost;
5479 }
5480
5481 return Discount;
5482 }
5483
5484 InstructionCost LoopVectorizationCostModel::expectedCost(
5485 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5486 InstructionCost Cost;
5487
5488 // For each block.
5489 for (BasicBlock *BB : TheLoop->blocks()) {
5490 InstructionCost BlockCost;
5491
5492 // For each instruction in the old loop.
5493 for (Instruction &I : BB->instructionsWithoutDebug()) {
5494 // Skip ignored values.
5495 if (ValuesToIgnore.count(&I) ||
5496 (VF.isVector() && VecValuesToIgnore.count(&I)))
5497 continue;
5498
5499 InstructionCost C = getInstructionCost(&I, VF);
5500
5501 // Check if we should override the cost.
5502 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5503 C = InstructionCost(ForceTargetInstructionCost);
5504
5505 // Keep a list of instructions with invalid costs.
5506 if (Invalid && !C.isValid())
5507 Invalid->emplace_back(&I, VF);
5508
5509 BlockCost += C;
5510 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5511 << VF << " For instruction: " << I << '\n');
5512 }
5513
5514 // If we are vectorizing a predicated block, it will have been
5515 // if-converted. This means that the block's instructions (aside from
5516 // stores and instructions that may divide by zero) will now be
5517 // unconditionally executed. For the scalar case, we may not always execute
5518 // the predicated block, if it is an if-else block. Thus, scale the block's
5519 // cost by the probability of executing it. blockNeedsPredication from
5520 // Legal is used so as to not include all blocks in tail folded loops.
5521 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5522 BlockCost /= getReciprocalPredBlockProb();
5523
5524 Cost += BlockCost;
5525 }
5526
5527 return Cost;
5528 }
5529
5530 /// Gets Address Access SCEV after verifying that the access pattern
5531 /// is loop invariant except the induction variable dependence.
5532 ///
5533 /// This SCEV can be sent to the Target in order to estimate the address
5534 /// calculation cost.
5535 static const SCEV *getAddressAccessSCEV(
5536 Value *Ptr,
5537 LoopVectorizationLegality *Legal,
5538 PredicatedScalarEvolution &PSE,
5539 const Loop *TheLoop) {
5540
5541 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5542 if (!Gep)
5543 return nullptr;
5544
5545 // We are looking for a gep with all loop invariant indices except for one
5546 // which should be an induction variable.
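  // For example, a pointer such as "getelementptr %base, i64 %inv, i64 %iv",
  // with one loop-invariant index and the induction variable, qualifies; a GEP
  // with two loop-varying, non-induction indices does not.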
5547 auto SE = PSE.getSE();
5548 unsigned NumOperands = Gep->getNumOperands();
5549 for (unsigned i = 1; i < NumOperands; ++i) {
5550 Value *Opd = Gep->getOperand(i);
5551 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5552 !Legal->isInductionVariable(Opd))
5553 return nullptr;
5554 }
5555
5556   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5557 return PSE.getSCEV(Ptr);
5558 }
5559
5560 InstructionCost
5561 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5562 ElementCount VF) {
5563 assert(VF.isVector() &&
5564 "Scalarization cost of instruction implies vectorization.");
5565 if (VF.isScalable())
5566 return InstructionCost::getInvalid();
5567
5568 Type *ValTy = getLoadStoreType(I);
5569 auto SE = PSE.getSE();
5570
5571 unsigned AS = getLoadStoreAddressSpace(I);
5572 Value *Ptr = getLoadStorePointerOperand(I);
5573 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5574 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5575 // that it is being called from this specific place.
5576
5577 // Figure out whether the access is strided and get the stride value
5578   // if it's known at compile time.
5579 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5580
5581 // Get the cost of the scalar memory instruction and address computation.
5582 InstructionCost Cost =
5583 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5584
5585 // Don't pass *I here, since it is scalar but will actually be part of a
5586 // vectorized loop where the user of it is a vectorized instruction.
5587 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5588 const Align Alignment = getLoadStoreAlignment(I);
5589 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5590 ValTy->getScalarType(),
5591 Alignment, AS, CostKind);
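  // Illustration with hypothetical unit costs: for VF = 4, an address
  // computation cost of 1 and a scalar memory-op cost of 1, Cost is now
  // 4 * 1 + 4 * 1 = 8; scalarization overhead and any predication scaling are
  // added below.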
5592
5593 // Get the overhead of the extractelement and insertelement instructions
5594 // we might create due to scalarization.
5595 Cost += getScalarizationOverhead(I, VF, CostKind);
5596
5597 // If we have a predicated load/store, it will need extra i1 extracts and
5598 // conditional branches, but may not be executed for each vector lane. Scale
5599 // the cost by the probability of executing the predicated block.
5600 if (isPredicatedInst(I)) {
5601 Cost /= getReciprocalPredBlockProb();
5602
5603 // Add the cost of an i1 extract and a branch
5604 auto *Vec_i1Ty =
5605 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5606 Cost += TTI.getScalarizationOverhead(
5607 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5608 /*Insert=*/false, /*Extract=*/true, CostKind);
5609 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5610
5611 if (useEmulatedMaskMemRefHack(I, VF))
5612 // Artificially setting to a high enough value to practically disable
5613 // vectorization with such operations.
5614 Cost = 3000000;
5615 }
5616
5617 return Cost;
5618 }
5619
5620 InstructionCost
5621 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5622 ElementCount VF) {
5623 Type *ValTy = getLoadStoreType(I);
5624 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5625 Value *Ptr = getLoadStorePointerOperand(I);
5626 unsigned AS = getLoadStoreAddressSpace(I);
5627 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5628 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5629
5630 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5631 "Stride should be 1 or -1 for consecutive memory access");
5632 const Align Alignment = getLoadStoreAlignment(I);
5633 InstructionCost Cost = 0;
5634 if (Legal->isMaskRequired(I)) {
5635 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5636 CostKind);
5637 } else {
5638 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5639 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5640 CostKind, OpInfo, I);
5641 }
5642
5643 bool Reverse = ConsecutiveStride < 0;
5644 if (Reverse)
5645 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5646 std::nullopt, CostKind, 0);
5647 return Cost;
5648 }
5649
5650 InstructionCost
5651 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5652 ElementCount VF) {
5653 assert(Legal->isUniformMemOp(*I, VF));
5654
5655 Type *ValTy = getLoadStoreType(I);
5656 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5657 const Align Alignment = getLoadStoreAlignment(I);
5658 unsigned AS = getLoadStoreAddressSpace(I);
5659 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5660 if (isa<LoadInst>(I)) {
5661 return TTI.getAddressComputationCost(ValTy) +
5662 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5663 CostKind) +
5664 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5665 }
5666 StoreInst *SI = cast<StoreInst>(I);
5667
5668 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5669 return TTI.getAddressComputationCost(ValTy) +
5670 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5671 CostKind) +
5672 (isLoopInvariantStoreValue
5673 ? 0
5674 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5675 CostKind, VF.getKnownMinValue() - 1));
5676 }
5677
5678 InstructionCost
5679 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5680 ElementCount VF) {
5681 Type *ValTy = getLoadStoreType(I);
5682 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5683 const Align Alignment = getLoadStoreAlignment(I);
5684 const Value *Ptr = getLoadStorePointerOperand(I);
5685
5686 return TTI.getAddressComputationCost(VectorTy) +
5687 TTI.getGatherScatterOpCost(
5688 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5689 TargetTransformInfo::TCK_RecipThroughput, I);
5690 }
5691
5692 InstructionCost
5693 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5694 ElementCount VF) {
5695 Type *ValTy = getLoadStoreType(I);
5696 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5697 unsigned AS = getLoadStoreAddressSpace(I);
5698 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5699
5700 auto Group = getInterleavedAccessGroup(I);
5701 assert(Group && "Fail to get an interleaved access group.");
5702
5703 unsigned InterleaveFactor = Group->getFactor();
5704 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
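  // For example, an interleave group with factor 2 at VF = 4 over i32 elements
  // is costed below on a single wide <8 x i32> vector type.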
5705
5706 // Holds the indices of existing members in the interleaved group.
5707 SmallVector<unsigned, 4> Indices;
5708 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5709 if (Group->getMember(IF))
5710 Indices.push_back(IF);
5711
5712 // Calculate the cost of the whole interleaved group.
5713 bool UseMaskForGaps =
5714 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5715 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5716 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5717 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
5718 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
5719
5720 if (Group->isReverse()) {
5721 // TODO: Add support for reversed masked interleaved access.
5722 assert(!Legal->isMaskRequired(I) &&
5723 "Reverse masked interleaved access not supported.");
5724 Cost += Group->getNumMembers() *
5725 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
5726 std::nullopt, CostKind, 0);
5727 }
5728 return Cost;
5729 }
5730
5731 std::optional<InstructionCost>
5732 LoopVectorizationCostModel::getReductionPatternCost(
5733 Instruction *I, ElementCount VF, Type *Ty,
5734 TTI::TargetCostKind CostKind) const {
5735 using namespace llvm::PatternMatch;
5736   // Early exit for no in-loop reductions.
5737 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5738 return std::nullopt;
5739 auto *VectorTy = cast<VectorType>(Ty);
5740
5741 // We are looking for a pattern of, and finding the minimal acceptable cost:
5742 // reduce(mul(ext(A), ext(B))) or
5743 // reduce(mul(A, B)) or
5744 // reduce(ext(A)) or
5745 // reduce(A).
5746 // The basic idea is that we walk down the tree to do that, finding the root
5747 // reduction instruction in InLoopReductionImmediateChains. From there we find
5748 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5749 // of the components. If the reduction cost is lower then we return it for the
5750 // reduction instruction and 0 for the other instructions in the pattern. If
5751 // it is not, we return an invalid cost specifying the original cost method
5752 // should be used.
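  //
  // For example (hypothetical IR): for "%m = mul i32 %ea, %eb" feeding an
  // in-loop add reduction, where %ea and %eb are sign extensions of i8 values,
  // the walk below reaches the reducing add and compares the cost of a single
  // multiply-accumulate reduction against the summed costs of the extends, the
  // mul and a plain add reduction.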
5753 Instruction *RetI = I;
5754 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5755 if (!RetI->hasOneUser())
5756 return std::nullopt;
5757 RetI = RetI->user_back();
5758 }
5759
5760 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5761 RetI->user_back()->getOpcode() == Instruction::Add) {
5762 RetI = RetI->user_back();
5763 }
5764
5765 // Test if the found instruction is a reduction, and if not return an invalid
5766 // cost specifying the parent to use the original cost modelling.
5767 if (!InLoopReductionImmediateChains.count(RetI))
5768 return std::nullopt;
5769
5770 // Find the reduction this chain is a part of and calculate the basic cost of
5771 // the reduction on its own.
5772 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5773 Instruction *ReductionPhi = LastChain;
5774 while (!isa<PHINode>(ReductionPhi))
5775 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5776
5777 const RecurrenceDescriptor &RdxDesc =
5778 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5779
5780 InstructionCost BaseCost;
5781 RecurKind RK = RdxDesc.getRecurrenceKind();
5782 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5783 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5784 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5785 RdxDesc.getFastMathFlags(), CostKind);
5786 } else {
5787 BaseCost = TTI.getArithmeticReductionCost(
5788 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5789 }
5790
5791 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5792 // normal fmul instruction to the cost of the fadd reduction.
5793 if (RK == RecurKind::FMulAdd)
5794 BaseCost +=
5795 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5796
5797 // If we're using ordered reductions then we can just return the base cost
5798 // here, since getArithmeticReductionCost calculates the full ordered
5799 // reduction cost when FP reassociation is not allowed.
5800 if (useOrderedReductions(RdxDesc))
5801 return BaseCost;
5802
5803 // Get the operand that was not the reduction chain and match it to one of the
5804 // patterns, returning the better cost if it is found.
5805 Instruction *RedOp = RetI->getOperand(1) == LastChain
5806 ? dyn_cast<Instruction>(RetI->getOperand(0))
5807 : dyn_cast<Instruction>(RetI->getOperand(1));
5808
5809 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5810
5811 Instruction *Op0, *Op1;
5812 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5813 match(RedOp,
5814 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5815 match(Op0, m_ZExtOrSExt(m_Value())) &&
5816 Op0->getOpcode() == Op1->getOpcode() &&
5817 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5818 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5819 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5820
5821 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5822 // Note that the extend opcodes need to all match, or if A==B they will have
5823 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5824 // which is equally fine.
5825 bool IsUnsigned = isa<ZExtInst>(Op0);
5826 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5827 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5828
5829 InstructionCost ExtCost =
5830 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5831 TTI::CastContextHint::None, CostKind, Op0);
5832 InstructionCost MulCost =
5833 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5834 InstructionCost Ext2Cost =
5835 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5836 TTI::CastContextHint::None, CostKind, RedOp);
5837
5838 InstructionCost RedCost = TTI.getMulAccReductionCost(
5839 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5840
5841 if (RedCost.isValid() &&
5842 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5843 return I == RetI ? RedCost : 0;
5844 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5845 !TheLoop->isLoopInvariant(RedOp)) {
5846 // Matched reduce(ext(A))
5847 bool IsUnsigned = isa<ZExtInst>(RedOp);
5848 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5849 InstructionCost RedCost = TTI.getExtendedReductionCost(
5850 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5851 RdxDesc.getFastMathFlags(), CostKind);
5852
5853 InstructionCost ExtCost =
5854 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5855 TTI::CastContextHint::None, CostKind, RedOp);
5856 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5857 return I == RetI ? RedCost : 0;
5858 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5859 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5860 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5861 Op0->getOpcode() == Op1->getOpcode() &&
5862 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5863 bool IsUnsigned = isa<ZExtInst>(Op0);
5864 Type *Op0Ty = Op0->getOperand(0)->getType();
5865 Type *Op1Ty = Op1->getOperand(0)->getType();
5866 Type *LargestOpTy =
5867 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5868 : Op0Ty;
5869 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5870
5871 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5872 // different sizes. We take the largest type as the ext to reduce, and add
5873 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5874 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5875 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5876 TTI::CastContextHint::None, CostKind, Op0);
5877 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5878 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5879 TTI::CastContextHint::None, CostKind, Op1);
5880 InstructionCost MulCost =
5881 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5882
5883 InstructionCost RedCost = TTI.getMulAccReductionCost(
5884 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5885 InstructionCost ExtraExtCost = 0;
5886 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5887 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5888 ExtraExtCost = TTI.getCastInstrCost(
5889 ExtraExtOp->getOpcode(), ExtType,
5890 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5891 TTI::CastContextHint::None, CostKind, ExtraExtOp);
5892 }
5893
5894 if (RedCost.isValid() &&
5895 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5896 return I == RetI ? RedCost : 0;
5897 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5898 // Matched reduce.add(mul())
5899 InstructionCost MulCost =
5900 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5901
5902 InstructionCost RedCost = TTI.getMulAccReductionCost(
5903 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5904
5905 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5906 return I == RetI ? RedCost : 0;
5907 }
5908 }
5909
5910 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5911 }
5912
5913 InstructionCost
5914 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5915 ElementCount VF) {
5916 // Calculate scalar cost only. Vectorization cost should be ready at this
5917 // moment.
5918 if (VF.isScalar()) {
5919 Type *ValTy = getLoadStoreType(I);
5920 const Align Alignment = getLoadStoreAlignment(I);
5921 unsigned AS = getLoadStoreAddressSpace(I);
5922
5923 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5924 return TTI.getAddressComputationCost(ValTy) +
5925 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
5926 TTI::TCK_RecipThroughput, OpInfo, I);
5927 }
5928 return getWideningCost(I, VF);
5929 }
5930
5931 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
5932 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
5933
5934 // There is no mechanism yet to create a scalable scalarization loop,
5935 // so this is currently Invalid.
5936 if (VF.isScalable())
5937 return InstructionCost::getInvalid();
5938
5939 if (VF.isScalar())
5940 return 0;
5941
5942 InstructionCost Cost = 0;
5943 Type *RetTy = ToVectorTy(I->getType(), VF);
5944 if (!RetTy->isVoidTy() &&
5945 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5946 Cost += TTI.getScalarizationOverhead(
5947 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
5948 /*Insert*/ true,
5949 /*Extract*/ false, CostKind);
5950
5951 // Some targets keep addresses scalar.
5952 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5953 return Cost;
5954
5955 // Some targets support efficient element stores.
5956 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5957 return Cost;
5958
5959 // Collect operands to consider.
5960 CallInst *CI = dyn_cast<CallInst>(I);
5961 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5962
5963 // Skip operands that do not require extraction/scalarization and do not incur
5964 // any overhead.
5965 SmallVector<Type *> Tys;
5966 for (auto *V : filterExtractingOperands(Ops, VF))
5967 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
5968 return Cost + TTI.getOperandsScalarizationOverhead(
5969 filterExtractingOperands(Ops, VF), Tys, CostKind);
5970 }
5971
5972 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
5973 if (VF.isScalar())
5974 return;
5975 NumPredStores = 0;
5976 for (BasicBlock *BB : TheLoop->blocks()) {
5977 // For each instruction in the old loop.
5978 for (Instruction &I : *BB) {
5979 Value *Ptr = getLoadStorePointerOperand(&I);
5980 if (!Ptr)
5981 continue;
5982
5983 // TODO: We should generate better code and update the cost model for
5984 // predicated uniform stores. Today they are treated as any other
5985 // predicated store (see added test cases in
5986 // invariant-store-vectorization.ll).
5987 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
5988 NumPredStores++;
5989
5990 if (Legal->isUniformMemOp(I, VF)) {
5991 auto isLegalToScalarize = [&]() {
5992 if (!VF.isScalable())
5993 // Scalarization of fixed length vectors "just works".
5994 return true;
5995
5996 // We have dedicated lowering for unpredicated uniform loads and
5997 // stores. Note that even with tail folding we know that at least
5998 // one lane is active (i.e. generalized predication is not possible
5999 // here), and the logic below depends on this fact.
6000 if (!foldTailByMasking())
6001 return true;
6002
6003 // For scalable vectors, a uniform memop load is always
6004 // uniform-by-parts and we know how to scalarize that.
6005 if (isa<LoadInst>(I))
6006 return true;
6007
6008             // A uniform store isn't necessarily uniform-by-parts
6009 // and we can't assume scalarization.
6010 auto &SI = cast<StoreInst>(I);
6011 return TheLoop->isLoopInvariant(SI.getValueOperand());
6012 };
6013
6014 const InstructionCost GatherScatterCost =
6015 isLegalGatherOrScatter(&I, VF) ?
6016 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6017
6018 // Load: Scalar load + broadcast
6019 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6020 // FIXME: This cost is a significant under-estimate for tail folded
6021 // memory ops.
6022 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6023 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6024
6025           // Choose the better solution for the current VF. Note that Invalid
6026           // costs compare as maximally large. If both are invalid, we get
6027           // scalable invalid which signals a failure and a vectorization abort.
6028 if (GatherScatterCost < ScalarizationCost)
6029 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6030 else
6031 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6032 continue;
6033 }
6034
6035 // We assume that widening is the best solution when possible.
6036 if (memoryInstructionCanBeWidened(&I, VF)) {
6037 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6038 int ConsecutiveStride = Legal->isConsecutivePtr(
6039 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6040 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6041 "Expected consecutive stride.");
6042 InstWidening Decision =
6043 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6044 setWideningDecision(&I, VF, Decision, Cost);
6045 continue;
6046 }
6047
6048 // Choose between Interleaving, Gather/Scatter or Scalarization.
6049 InstructionCost InterleaveCost = InstructionCost::getInvalid();
6050 unsigned NumAccesses = 1;
6051 if (isAccessInterleaved(&I)) {
6052 auto Group = getInterleavedAccessGroup(&I);
6053 assert(Group && "Fail to get an interleaved access group.");
6054
6055 // Make one decision for the whole group.
6056 if (getWideningDecision(&I, VF) != CM_Unknown)
6057 continue;
6058
6059 NumAccesses = Group->getNumMembers();
6060 if (interleavedAccessCanBeWidened(&I, VF))
6061 InterleaveCost = getInterleaveGroupCost(&I, VF);
6062 }
6063
6064 InstructionCost GatherScatterCost =
6065 isLegalGatherOrScatter(&I, VF)
6066 ? getGatherScatterCost(&I, VF) * NumAccesses
6067 : InstructionCost::getInvalid();
6068
6069 InstructionCost ScalarizationCost =
6070 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6071
6072         // Choose the better solution for the current VF,
6073 // write down this decision and use it during vectorization.
6074 InstructionCost Cost;
6075 InstWidening Decision;
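        // Note: ties between interleaving and gather/scatter are resolved in
        // favor of interleaving, but interleaving and gather/scatter must each
        // be strictly cheaper than scalarization to be chosen; otherwise we
        // fall back to scalarization.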
6076 if (InterleaveCost <= GatherScatterCost &&
6077 InterleaveCost < ScalarizationCost) {
6078 Decision = CM_Interleave;
6079 Cost = InterleaveCost;
6080 } else if (GatherScatterCost < ScalarizationCost) {
6081 Decision = CM_GatherScatter;
6082 Cost = GatherScatterCost;
6083 } else {
6084 Decision = CM_Scalarize;
6085 Cost = ScalarizationCost;
6086 }
6087         // If the instruction belongs to an interleave group, the whole group
6088 // receives the same decision. The whole group receives the cost, but
6089 // the cost will actually be assigned to one instruction.
6090 if (auto Group = getInterleavedAccessGroup(&I))
6091 setWideningDecision(Group, VF, Decision, Cost);
6092 else
6093 setWideningDecision(&I, VF, Decision, Cost);
6094 }
6095 }
6096
6097 // Make sure that any load of address and any other address computation
6098 // remains scalar unless there is gather/scatter support. This avoids
6099 // inevitable extracts into address registers, and also has the benefit of
6100 // activating LSR more, since that pass can't optimize vectorized
6101 // addresses.
6102 if (TTI.prefersVectorizedAddressing())
6103 return;
6104
6105 // Start with all scalar pointer uses.
6106 SmallPtrSet<Instruction *, 8> AddrDefs;
6107 for (BasicBlock *BB : TheLoop->blocks())
6108 for (Instruction &I : *BB) {
6109 Instruction *PtrDef =
6110 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6111 if (PtrDef && TheLoop->contains(PtrDef) &&
6112 getWideningDecision(&I, VF) != CM_GatherScatter)
6113 AddrDefs.insert(PtrDef);
6114 }
6115
6116 // Add all instructions used to generate the addresses.
6117 SmallVector<Instruction *, 4> Worklist;
6118 append_range(Worklist, AddrDefs);
6119 while (!Worklist.empty()) {
6120 Instruction *I = Worklist.pop_back_val();
6121 for (auto &Op : I->operands())
6122 if (auto *InstOp = dyn_cast<Instruction>(Op))
6123 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6124 AddrDefs.insert(InstOp).second)
6125 Worklist.push_back(InstOp);
6126 }
6127
6128 for (auto *I : AddrDefs) {
6129 if (isa<LoadInst>(I)) {
6130       // Setting the desired widening decision should ideally be handled
6131 // by cost functions, but since this involves the task of finding out
6132 // if the loaded register is involved in an address computation, it is
6133 // instead changed here when we know this is the case.
6134 InstWidening Decision = getWideningDecision(I, VF);
6135 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6136 // Scalarize a widened load of address.
6137 setWideningDecision(
6138 I, VF, CM_Scalarize,
6139 (VF.getKnownMinValue() *
6140 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6141 else if (auto Group = getInterleavedAccessGroup(I)) {
6142 // Scalarize an interleave group of address loads.
6143 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6144 if (Instruction *Member = Group->getMember(I))
6145 setWideningDecision(
6146 Member, VF, CM_Scalarize,
6147 (VF.getKnownMinValue() *
6148 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6149 }
6150 }
6151 } else
6152 // Make sure I gets scalarized and a cost estimate without
6153 // scalarization overhead.
6154 ForcedScalars[VF].insert(I);
6155 }
6156 }
6157
6158 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6159 assert(!VF.isScalar() &&
6160 "Trying to set a vectorization decision for a scalar VF");
6161
6162 for (BasicBlock *BB : TheLoop->blocks()) {
6163 // For each instruction in the old loop.
6164 for (Instruction &I : *BB) {
6165 CallInst *CI = dyn_cast<CallInst>(&I);
6166
6167 if (!CI)
6168 continue;
6169
6170 InstructionCost ScalarCost = InstructionCost::getInvalid();
6171 InstructionCost VectorCost = InstructionCost::getInvalid();
6172 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6173 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6174
6175 Function *ScalarFunc = CI->getCalledFunction();
6176 Type *ScalarRetTy = CI->getType();
6177 SmallVector<Type *, 4> Tys, ScalarTys;
6178 bool MaskRequired = Legal->isMaskRequired(CI);
6179 for (auto &ArgOp : CI->args())
6180 ScalarTys.push_back(ArgOp->getType());
6181
6182 // Compute corresponding vector type for return value and arguments.
6183 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6184 for (Type *ScalarTy : ScalarTys)
6185 Tys.push_back(ToVectorTy(ScalarTy, VF));
6186
6187 // An in-loop reduction using an fmuladd intrinsic is a special case;
6188 // we don't want the normal cost for that intrinsic.
6189 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6190 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6191 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6192 getVectorIntrinsicIDForCall(CI, TLI),
6193 std::nullopt, *RedCost);
6194 continue;
6195 }
6196
6197 // Estimate cost of scalarized vector call. The source operands are
6198 // assumed to be vectors, so we need to extract individual elements from
6199       // them, execute VF scalar calls, and then gather the result into the
6200 // vector return value.
6201 InstructionCost ScalarCallCost =
6202 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6203
6204 // Compute costs of unpacking argument values for the scalar calls and
6205 // packing the return values to a vector.
6206 InstructionCost ScalarizationCost =
6207 getScalarizationOverhead(CI, VF, CostKind);
6208
6209 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
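      // For example (hypothetical costs): at VF = 4 with a scalar call cost of
      // 10 and a scalarization overhead of 6, ScalarCost = 4 * 10 + 6 = 46.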
6210
6211 // Find the cost of vectorizing the call, if we can find a suitable
6212 // vector variant of the function.
6213 bool UsesMask = false;
6214 VFInfo FuncInfo;
6215 Function *VecFunc = nullptr;
6216 // Search through any available variants for one we can use at this VF.
6217 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6218 // Must match requested VF.
6219 if (Info.Shape.VF != VF)
6220 continue;
6221
6222 // Must take a mask argument if one is required
6223 if (MaskRequired && !Info.isMasked())
6224 continue;
6225
6226 // Check that all parameter kinds are supported
6227 bool ParamsOk = true;
6228 for (VFParameter Param : Info.Shape.Parameters) {
6229 switch (Param.ParamKind) {
6230 case VFParamKind::Vector:
6231 break;
6232 case VFParamKind::OMP_Uniform: {
6233 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6234 // Make sure the scalar parameter in the loop is invariant.
6235 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6236 TheLoop))
6237 ParamsOk = false;
6238 break;
6239 }
6240 case VFParamKind::OMP_Linear: {
6241 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6242 // Find the stride for the scalar parameter in this loop and see if
6243 // it matches the stride for the variant.
6244 // TODO: do we need to figure out the cost of an extract to get the
6245 // first lane? Or do we hope that it will be folded away?
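          // For example (hypothetical variant): a vector variant whose
          // parameter is declared linear with step 2 only matches if the
          // scalar argument's SCEV is an add recurrence over this loop with a
          // constant step of 2.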
6246 ScalarEvolution *SE = PSE.getSE();
6247 const auto *SAR =
6248 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6249
6250 if (!SAR || SAR->getLoop() != TheLoop) {
6251 ParamsOk = false;
6252 break;
6253 }
6254
6255 const SCEVConstant *Step =
6256 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6257
6258 if (!Step ||
6259 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6260 ParamsOk = false;
6261
6262 break;
6263 }
6264 case VFParamKind::GlobalPredicate:
6265 UsesMask = true;
6266 break;
6267 default:
6268 ParamsOk = false;
6269 break;
6270 }
6271 }
6272
6273 if (!ParamsOk)
6274 continue;
6275
6276 // Found a suitable candidate, stop here.
6277 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6278 FuncInfo = Info;
6279 break;
6280 }
6281
6282 // Add in the cost of synthesizing a mask if one wasn't required.
6283 InstructionCost MaskCost = 0;
6284 if (VecFunc && UsesMask && !MaskRequired)
6285 MaskCost = TTI.getShuffleCost(
6286 TargetTransformInfo::SK_Broadcast,
6287 VectorType::get(IntegerType::getInt1Ty(
6288 VecFunc->getFunctionType()->getContext()),
6289 VF));
6290
6291 if (TLI && VecFunc && !CI->isNoBuiltin())
6292 VectorCost =
6293 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6294
6295 // Find the cost of an intrinsic; some targets may have instructions that
6296 // perform the operation without needing an actual call.
6297 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6298 if (IID != Intrinsic::not_intrinsic)
6299 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6300
6301 InstructionCost Cost = ScalarCost;
6302 InstWidening Decision = CM_Scalarize;
6303
6304 if (VectorCost <= Cost) {
6305 Cost = VectorCost;
6306 Decision = CM_VectorCall;
6307 }
6308
6309 if (IntrinsicCost <= Cost) {
6310 Cost = IntrinsicCost;
6311 Decision = CM_IntrinsicCall;
6312 }
6313
6314 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6315 FuncInfo.getParamIndexForOptionalMask(), Cost);
6316 }
6317 }
6318 }
6319
6320 InstructionCost
6321 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6322 ElementCount VF) {
6323 // If we know that this instruction will remain uniform, check the cost of
6324 // the scalar version.
6325 if (isUniformAfterVectorization(I, VF))
6326 VF = ElementCount::getFixed(1);
6327
6328 if (VF.isVector() && isProfitableToScalarize(I, VF))
6329 return InstsToScalarize[VF][I];
6330
6331 // Forced scalars do not have any scalarization overhead.
6332 auto ForcedScalar = ForcedScalars.find(VF);
6333 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6334 auto InstSet = ForcedScalar->second;
6335 if (InstSet.count(I))
6336 return getInstructionCost(I, ElementCount::getFixed(1)) *
6337 VF.getKnownMinValue();
6338 }
6339
6340 Type *RetTy = I->getType();
6341 if (canTruncateToMinimalBitwidth(I, VF))
6342 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6343 auto SE = PSE.getSE();
6344 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6345
6346 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6347 ElementCount VF) -> bool {
6348 if (VF.isScalar())
6349 return true;
6350
6351 auto Scalarized = InstsToScalarize.find(VF);
6352 assert(Scalarized != InstsToScalarize.end() &&
6353 "VF not yet analyzed for scalarization profitability");
6354 return !Scalarized->second.count(I) &&
6355 llvm::all_of(I->users(), [&](User *U) {
6356 auto *UI = cast<Instruction>(U);
6357 return !Scalarized->second.count(UI);
6358 });
6359 };
6360 (void) hasSingleCopyAfterVectorization;
6361
6362 Type *VectorTy;
6363 if (isScalarAfterVectorization(I, VF)) {
6364 // With the exception of GEPs and PHIs, after scalarization there should
6365 // only be one copy of the instruction generated in the loop. This is
6366 // because the VF is either 1, or any instructions that need scalarizing
6367 // have already been dealt with by the time we get here. As a result,
6368 // it means we don't have to multiply the instruction cost by VF.
6369 assert(I->getOpcode() == Instruction::GetElementPtr ||
6370 I->getOpcode() == Instruction::PHI ||
6371 (I->getOpcode() == Instruction::BitCast &&
6372 I->getType()->isPointerTy()) ||
6373 hasSingleCopyAfterVectorization(I, VF));
6374 VectorTy = RetTy;
6375 } else
6376 VectorTy = ToVectorTy(RetTy, VF);
6377
6378 if (VF.isVector() && VectorTy->isVectorTy() &&
6379 !TTI.getNumberOfParts(VectorTy))
6380 return InstructionCost::getInvalid();
6381
6382 // TODO: We need to estimate the cost of intrinsic calls.
6383 switch (I->getOpcode()) {
6384 case Instruction::GetElementPtr:
6385 // We mark this instruction as zero-cost because the cost of GEPs in
6386 // vectorized code depends on whether the corresponding memory instruction
6387 // is scalarized or not. Therefore, we handle GEPs with the memory
6388 // instruction cost.
6389 return 0;
6390 case Instruction::Br: {
6391 // In cases of scalarized and predicated instructions, there will be VF
6392 // predicated blocks in the vectorized loop. Each branch around these
6393     // blocks also requires an extract of its vector compare i1 element.
6394 // Note that the conditional branch from the loop latch will be replaced by
6395 // a single branch controlling the loop, so there is no extra overhead from
6396 // scalarization.
6397 bool ScalarPredicatedBB = false;
6398 BranchInst *BI = cast<BranchInst>(I);
6399 if (VF.isVector() && BI->isConditional() &&
6400 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6401 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6402 BI->getParent() != TheLoop->getLoopLatch())
6403 ScalarPredicatedBB = true;
6404
6405 if (ScalarPredicatedBB) {
6406       // Not possible to scalarize a scalable vector with predicated instructions.
6407 if (VF.isScalable())
6408 return InstructionCost::getInvalid();
6409 // Return cost for branches around scalarized and predicated blocks.
6410 auto *Vec_i1Ty =
6411 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6412 return (
6413 TTI.getScalarizationOverhead(
6414 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6415 /*Insert*/ false, /*Extract*/ true, CostKind) +
6416 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6417 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6418 // The back-edge branch will remain, as will all scalar branches.
6419 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6420 else
6421 // This branch will be eliminated by if-conversion.
6422 return 0;
6423 // Note: We currently assume zero cost for an unconditional branch inside
6424 // a predicated block since it will become a fall-through, although we
6425 // may decide in the future to call TTI for all branches.
6426 }
6427 case Instruction::PHI: {
6428 auto *Phi = cast<PHINode>(I);
6429
6430 // First-order recurrences are replaced by vector shuffles inside the loop.
6431 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6432 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6433 // penultimate value of the recurrence.
6434 // TODO: Consider vscale_range info.
6435 if (VF.isScalable() && VF.getKnownMinValue() == 1)
6436 return InstructionCost::getInvalid();
6437 SmallVector<int> Mask(VF.getKnownMinValue());
6438 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
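      // For example, at VF = 4 the mask is <3, 4, 5, 6>: the last lane of the
      // previous vector value followed by the first three lanes of the current
      // one, matching the splice the recurrence is lowered to.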
6439 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6440 cast<VectorType>(VectorTy), Mask, CostKind,
6441 VF.getKnownMinValue() - 1);
6442 }
6443
6444 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6445 // converted into select instructions. We require N - 1 selects per phi
6446 // node, where N is the number of incoming values.
6447 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6448 return (Phi->getNumIncomingValues() - 1) *
6449 TTI.getCmpSelInstrCost(
6450 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6451 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6452 CmpInst::BAD_ICMP_PREDICATE, CostKind);
6453
6454 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6455 }
6456 case Instruction::UDiv:
6457 case Instruction::SDiv:
6458 case Instruction::URem:
6459 case Instruction::SRem:
6460 if (VF.isVector() && isPredicatedInst(I)) {
6461 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6462 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6463 ScalarCost : SafeDivisorCost;
6464 }
6465 // We've proven all lanes safe to speculate, fall through.
6466 [[fallthrough]];
6467 case Instruction::Add:
6468 case Instruction::FAdd:
6469 case Instruction::Sub:
6470 case Instruction::FSub:
6471 case Instruction::Mul:
6472 case Instruction::FMul:
6473 case Instruction::FDiv:
6474 case Instruction::FRem:
6475 case Instruction::Shl:
6476 case Instruction::LShr:
6477 case Instruction::AShr:
6478 case Instruction::And:
6479 case Instruction::Or:
6480 case Instruction::Xor: {
6481 // If we're speculating on the stride being 1, the multiplication may
6482 // fold away. We can generalize this for all operations using the notion
6483 // of neutral elements. (TODO)
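    // For example, "%offset = mul i64 %i, %stride" becomes free here when PSE
    // lets us speculate that %stride is 1.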
6484 if (I->getOpcode() == Instruction::Mul &&
6485 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6486 PSE.getSCEV(I->getOperand(1))->isOne()))
6487 return 0;
6488
6489 // Detect reduction patterns
6490 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6491 return *RedCost;
6492
6493 // Certain instructions can be cheaper to vectorize if they have a constant
6494     // second vector operand. One example of this is shifts on x86.
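    // Illustrative sketch (hypothetical IR): a shift by a constant or
    // loop-invariant amount, e.g.
    //   %r = shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
    // can typically be lowered to a single uniform-count shift, whereas a
    // per-lane variable shift is usually more expensive; marking Op2 as
    // uniform lets TTI model that difference.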
6495 Value *Op2 = I->getOperand(1);
6496 auto Op2Info = TTI.getOperandInfo(Op2);
6497 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6498 Legal->isInvariant(Op2))
6499 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6500
6501 SmallVector<const Value *, 4> Operands(I->operand_values());
6502 return TTI.getArithmeticInstrCost(
6503 I->getOpcode(), VectorTy, CostKind,
6504 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6505 Op2Info, Operands, I, TLI);
6506 }
6507 case Instruction::FNeg: {
6508 return TTI.getArithmeticInstrCost(
6509 I->getOpcode(), VectorTy, CostKind,
6510 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6511 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6512 I->getOperand(0), I);
6513 }
6514 case Instruction::Select: {
6515 SelectInst *SI = cast<SelectInst>(I);
6516 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6517 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6518
6519 const Value *Op0, *Op1;
6520 using namespace llvm::PatternMatch;
6521 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6522 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6523 // select x, y, false --> x & y
6524 // select x, true, y --> x | y
6525 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6526 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6527 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6528 Op1->getType()->getScalarSizeInBits() == 1);
6529
6530 SmallVector<const Value *, 2> Operands{Op0, Op1};
6531 return TTI.getArithmeticInstrCost(
6532 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6533 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6534 }
6535
6536 Type *CondTy = SI->getCondition()->getType();
6537 if (!ScalarCond)
6538 CondTy = VectorType::get(CondTy, VF);
6539
6540 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6541 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6542 Pred = Cmp->getPredicate();
6543 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6544 CostKind, I);
6545 }
6546 case Instruction::ICmp:
6547 case Instruction::FCmp: {
6548 Type *ValTy = I->getOperand(0)->getType();
6549 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6550 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6551 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6552 VectorTy = ToVectorTy(ValTy, VF);
6553 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6554 cast<CmpInst>(I)->getPredicate(), CostKind,
6555 I);
6556 }
6557 case Instruction::Store:
6558 case Instruction::Load: {
6559 ElementCount Width = VF;
6560 if (Width.isVector()) {
6561 InstWidening Decision = getWideningDecision(I, Width);
6562 assert(Decision != CM_Unknown &&
6563 "CM decision should be taken at this point");
6564 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6565 return InstructionCost::getInvalid();
6566 if (Decision == CM_Scalarize)
6567 Width = ElementCount::getFixed(1);
6568 }
6569 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
6570 return getMemoryInstructionCost(I, VF);
6571 }
6572 case Instruction::BitCast:
6573 if (I->getType()->isPointerTy())
6574 return 0;
6575 [[fallthrough]];
6576 case Instruction::ZExt:
6577 case Instruction::SExt:
6578 case Instruction::FPToUI:
6579 case Instruction::FPToSI:
6580 case Instruction::FPExt:
6581 case Instruction::PtrToInt:
6582 case Instruction::IntToPtr:
6583 case Instruction::SIToFP:
6584 case Instruction::UIToFP:
6585 case Instruction::Trunc:
6586 case Instruction::FPTrunc: {
6587 // Computes the CastContextHint from a Load/Store instruction.
6588 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6589 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6590 "Expected a load or a store!");
6591
6592 if (VF.isScalar() || !TheLoop->contains(I))
6593 return TTI::CastContextHint::Normal;
6594
6595 switch (getWideningDecision(I, VF)) {
6596 case LoopVectorizationCostModel::CM_GatherScatter:
6597 return TTI::CastContextHint::GatherScatter;
6598 case LoopVectorizationCostModel::CM_Interleave:
6599 return TTI::CastContextHint::Interleave;
6600 case LoopVectorizationCostModel::CM_Scalarize:
6601 case LoopVectorizationCostModel::CM_Widen:
6602 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6603 : TTI::CastContextHint::Normal;
6604 case LoopVectorizationCostModel::CM_Widen_Reverse:
6605 return TTI::CastContextHint::Reversed;
6606 case LoopVectorizationCostModel::CM_Unknown:
6607 llvm_unreachable("Instr did not go through cost modelling?");
6608 case LoopVectorizationCostModel::CM_VectorCall:
6609 case LoopVectorizationCostModel::CM_IntrinsicCall:
6610 llvm_unreachable_internal("Instr has invalid widening decision");
6611 }
6612
6613 llvm_unreachable("Unhandled case!");
6614 };
6615
6616 unsigned Opcode = I->getOpcode();
6617 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6618 // For Trunc, the context is the only user, which must be a StoreInst.
6619 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6620 if (I->hasOneUse())
6621 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6622 CCH = ComputeCCH(Store);
6623 }
6624 // For Z/Sext, the context is the operand, which must be a LoadInst.
6625 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6626 Opcode == Instruction::FPExt) {
6627 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6628 CCH = ComputeCCH(Load);
6629 }
6630
6631 // We optimize the truncation of induction variables having constant
6632 // integer steps. The cost of these truncations is the same as the scalar
6633 // operation.
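    // Illustrative sketch (hypothetical IR): for
    //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
    //   %t  = trunc i64 %iv to i32
    // the truncate can be folded into a narrower i32 induction, so it is
    // costed like the scalar trunc below.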
6634 if (isOptimizableIVTruncate(I, VF)) {
6635 auto *Trunc = cast<TruncInst>(I);
6636 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6637 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6638 }
6639
6640 // Detect reduction patterns
6641 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6642 return *RedCost;
6643
6644 Type *SrcScalarTy = I->getOperand(0)->getType();
6645 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6646 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6647 SrcScalarTy =
6648 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6649 Type *SrcVecTy =
6650 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6651
6652 if (canTruncateToMinimalBitwidth(I, VF)) {
6653 // If the result type is <= the source type, there will be no extend
6654 // after truncating the users to the minimal required bitwidth.
6655 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6656 (I->getOpcode() == Instruction::ZExt ||
6657 I->getOpcode() == Instruction::SExt))
6658 return 0;
6659 }
6660
6661 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6662 }
6663 case Instruction::Call:
6664 return getVectorCallCost(cast<CallInst>(I), VF);
6665 case Instruction::ExtractValue:
6666 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
6667 case Instruction::Alloca:
6668 // We cannot easily widen alloca to a scalable alloca, as
6669 // the result would need to be a vector of pointers.
6670 if (VF.isScalable())
6671 return InstructionCost::getInvalid();
6672 [[fallthrough]];
6673 default:
6674 // This opcode is unknown. Assume that it is the same as 'mul'.
6675 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6676 } // end of switch.
6677 }
6678
6679 void LoopVectorizationCostModel::collectValuesToIgnore() {
6680 // Ignore ephemeral values.
6681 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6682
6683 SmallVector<Value *, 4> DeadInterleavePointerOps;
6684 for (BasicBlock *BB : TheLoop->blocks())
6685 for (Instruction &I : *BB) {
6686 // Find all stores to invariant variables. Since they are going to sink
6687       // outside the loop, we do not need to calculate the cost for them.
6688 StoreInst *SI;
6689 if ((SI = dyn_cast<StoreInst>(&I)) &&
6690 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
6691 ValuesToIgnore.insert(&I);
6692
6693 // For interleave groups, we only create a pointer for the start of the
6694 // interleave group. Queue up addresses of group members except the insert
6695 // position for further processing.
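      // Illustrative example (hypothetical group): for a two-member group
      // loading A[2*i] and A[2*i+1], only the insert position's address is
      // materialized; the pointer computation feeding the other member
      // becomes dead and is queued here so its cost can be ignored.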
6696 if (isAccessInterleaved(&I)) {
6697 auto *Group = getInterleavedAccessGroup(&I);
6698 if (Group->getInsertPos() == &I)
6699 continue;
6700 Value *PointerOp = getLoadStorePointerOperand(&I);
6701 DeadInterleavePointerOps.push_back(PointerOp);
6702 }
6703 }
6704
6705 // Mark ops feeding interleave group members as free, if they are only used
6706 // by other dead computations.
6707 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6708 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6709 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6710 Instruction *UI = cast<Instruction>(U);
6711 return !VecValuesToIgnore.contains(U) &&
6712 (!isAccessInterleaved(UI) ||
6713 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6714 }))
6715 continue;
6716 VecValuesToIgnore.insert(Op);
6717 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6718 }
6719
6720 // Ignore type-promoting instructions we identified during reduction
6721 // detection.
6722 for (const auto &Reduction : Legal->getReductionVars()) {
6723 const RecurrenceDescriptor &RedDes = Reduction.second;
6724 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6725 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6726 }
6727 // Ignore type-casting instructions we identified during induction
6728 // detection.
6729 for (const auto &Induction : Legal->getInductionVars()) {
6730 const InductionDescriptor &IndDes = Induction.second;
6731 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6732 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6733 }
6734 }
6735
6736 void LoopVectorizationCostModel::collectInLoopReductions() {
6737 for (const auto &Reduction : Legal->getReductionVars()) {
6738 PHINode *Phi = Reduction.first;
6739 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6740
6741 // We don't collect reductions that are type promoted (yet).
6742 if (RdxDesc.getRecurrenceType() != Phi->getType())
6743 continue;
6744
6745 // If the target would prefer this reduction to happen "in-loop", then we
6746 // want to record it as such.
6747 unsigned Opcode = RdxDesc.getOpcode();
6748 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6749 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6750 TargetTransformInfo::ReductionFlags()))
6751 continue;
6752
6753 // Check that we can correctly put the reductions into the loop, by
6754 // finding the chain of operations that leads from the phi to the loop
6755 // exit value.
6756 SmallVector<Instruction *, 4> ReductionOperations =
6757 RdxDesc.getReductionOpChain(Phi, TheLoop);
6758 bool InLoop = !ReductionOperations.empty();
6759
6760 if (InLoop) {
6761 InLoopReductions.insert(Phi);
6762 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6763 Instruction *LastChain = Phi;
6764 for (auto *I : ReductionOperations) {
6765 InLoopReductionImmediateChains[I] = LastChain;
6766 LastChain = I;
6767 }
6768 }
6769 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6770 << " reduction for phi: " << *Phi << "\n");
6771 }
6772 }
6773
6774 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
6775 DebugLoc DL, const Twine &Name) {
6776 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
6777 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
6778 return tryInsertInstruction(
6779 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
6780 }
6781
6782 // This function will select a scalable VF if the target supports scalable
6783 // vectors and a fixed one otherwise.
6784 // TODO: we could return a pair of values that specify the max VF and
6785 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6786 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6787 // doesn't have a cost model that can choose which plan to execute if
6788 // more than one is generated.
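// For example (hypothetical target numbers): a 128-bit fixed-width register
// and a widest element type of 32 bits give N = 128 / 32 = 4, returned as a
// fixed VF of 4; a target with scalable vectors would instead return
// vscale x 4.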
6789 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6790 LoopVectorizationCostModel &CM) {
6791 unsigned WidestType;
6792 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6793
6794 TargetTransformInfo::RegisterKind RegKind =
6795 TTI.enableScalableVectorization()
6796 ? TargetTransformInfo::RGK_ScalableVector
6797 : TargetTransformInfo::RGK_FixedWidthVector;
6798
6799 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
6800 unsigned N = RegSize.getKnownMinValue() / WidestType;
6801 return ElementCount::get(N, RegSize.isScalable());
6802 }
6803
6804 VectorizationFactor
6805 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6806 ElementCount VF = UserVF;
6807 // Outer loop handling: They may require CFG and instruction level
6808 // transformations before even evaluating whether vectorization is profitable.
6809 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6810 // the vectorization pipeline.
6811 if (!OrigLoop->isInnermost()) {
6812 // If the user doesn't provide a vectorization factor, determine a
6813 // reasonable one.
6814 if (UserVF.isZero()) {
6815 VF = determineVPlanVF(TTI, CM);
6816 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6817
6818 // Make sure we have a VF > 1 for stress testing.
6819 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6820 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6821 << "overriding computed VF.\n");
6822 VF = ElementCount::getFixed(4);
6823 }
6824 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6825 !ForceTargetSupportsScalableVectors) {
6826 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6827 << "not supported by the target.\n");
6828 reportVectorizationFailure(
6829 "Scalable vectorization requested but not supported by the target",
6830 "the scalable user-specified vectorization width for outer-loop "
6831 "vectorization cannot be used because the target does not support "
6832 "scalable vectors.",
6833 "ScalableVFUnfeasible", ORE, OrigLoop);
6834 return VectorizationFactor::Disabled();
6835 }
6836 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6837 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6838 "VF needs to be a power of two");
6839 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6840 << "VF " << VF << " to build VPlans.\n");
6841 buildVPlans(VF, VF);
6842
6843 // For VPlan build stress testing, we bail out after VPlan construction.
6844 if (VPlanBuildStressTest)
6845 return VectorizationFactor::Disabled();
6846
6847 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6848 }
6849
6850 LLVM_DEBUG(
6851 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6852 "VPlan-native path.\n");
6853 return VectorizationFactor::Disabled();
6854 }
6855
6856 std::optional<VectorizationFactor>
6857 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6858 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6859 CM.collectValuesToIgnore();
6860 CM.collectElementTypesForWidening();
6861
6862 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6863   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
6864 return std::nullopt;
6865
6866 // Invalidate interleave groups if all blocks of loop will be predicated.
6867 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
6868 !useMaskedInterleavedAccesses(TTI)) {
6869 LLVM_DEBUG(
6870 dbgs()
6871 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6872 "which requires masked-interleaved support.\n");
6873 if (CM.InterleaveInfo.invalidateGroups())
6874 // Invalidating interleave groups also requires invalidating all decisions
6875 // based on them, which includes widening decisions and uniform and scalar
6876 // values.
6877 CM.invalidateCostModelingDecisions();
6878 }
6879
6880 if (CM.foldTailByMasking())
6881 Legal->prepareToFoldTailByMasking();
6882
6883 ElementCount MaxUserVF =
6884 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6885 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
6886 if (!UserVF.isZero() && UserVFIsLegal) {
6887 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6888 "VF needs to be a power of two");
6889 // Collect the instructions (and their associated costs) that will be more
6890 // profitable to scalarize.
6891 CM.collectInLoopReductions();
6892 if (CM.selectUserVectorizationFactor(UserVF)) {
6893 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6894 buildVPlansWithVPRecipes(UserVF, UserVF);
6895 if (!hasPlanWithVF(UserVF)) {
6896 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
6897 << ".\n");
6898 return std::nullopt;
6899 }
6900
6901 LLVM_DEBUG(printPlans(dbgs()));
6902 return {{UserVF, 0, 0}};
6903 } else
6904 reportVectorizationInfo("UserVF ignored because of invalid costs.",
6905 "InvalidCost", ORE, OrigLoop);
6906 }
6907
6908 // Collect the Vectorization Factor Candidates.
6909 SmallVector<ElementCount> VFCandidates;
6910 for (auto VF = ElementCount::getFixed(1);
6911 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
6912 VFCandidates.push_back(VF);
6913 for (auto VF = ElementCount::getScalable(1);
6914 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
6915 VFCandidates.push_back(VF);
6916
6917 CM.collectInLoopReductions();
6918 for (const auto &VF : VFCandidates) {
6919 // Collect Uniform and Scalar instructions after vectorization with VF.
6920 CM.collectUniformsAndScalars(VF);
6921
6922 // Collect the instructions (and their associated costs) that will be more
6923 // profitable to scalarize.
6924 if (VF.isVector())
6925 CM.collectInstsToScalarize(VF);
6926 }
6927
6928 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
6929 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
6930
6931 LLVM_DEBUG(printPlans(dbgs()));
6932 if (VPlans.empty())
6933 return std::nullopt;
6934 if (all_of(VPlans,
6935 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
6936 return VectorizationFactor::Disabled();
6937
6938 // Select the optimal vectorization factor according to the legacy cost-model.
6939 // This is now only used to verify the decisions by the new VPlan-based
6940 // cost-model and will be retired once the VPlan-based cost-model is
6941 // stabilized.
6942 VectorizationFactor VF = selectVectorizationFactor();
6943   assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be non-zero.");
6944 if (!hasPlanWithVF(VF.Width)) {
6945 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
6946 << ".\n");
6947 return std::nullopt;
6948 }
6949 return VF;
6950 }
6951
6952 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6953 ElementCount VF) const {
6954 return CM.getInstructionCost(UI, VF);
6955 }
6956
6957 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6958 return CM.ValuesToIgnore.contains(UI) ||
6959 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
6960 SkipCostComputation.contains(UI);
6961 }
6962
6963 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
6964 ElementCount VF) const {
6965 InstructionCost Cost = 0;
6966 LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
6967 VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
6968
6969 // Cost modeling for inductions is inaccurate in the legacy cost model
6970 // compared to the recipes that are generated. To match here initially during
6971 // VPlan cost model bring up directly use the induction costs from the legacy
6972 // cost model. Note that we do this as pre-processing; the VPlan may not have
6973 // any recipes associated with the original induction increment instruction
6974 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6975 // the cost of induction phis and increments (both that are represented by
6976 // recipes and those that are not), to avoid distinguishing between them here,
6977 // and skip all recipes that represent induction phis and increments (the
6978 // former case) later on, if they exist, to avoid counting them twice.
6979 // Similarly we pre-compute the cost of any optimized truncates.
6980 // TODO: Switch to more accurate costing based on VPlan.
6981 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6982 Instruction *IVInc = cast<Instruction>(
6983 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
6984 SmallVector<Instruction *> IVInsts = {IV, IVInc};
6985 for (User *U : IV->users()) {
6986 auto *CI = cast<Instruction>(U);
6987 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
6988 continue;
6989 IVInsts.push_back(CI);
6990 }
6991 for (Instruction *IVInst : IVInsts) {
6992 if (!CostCtx.SkipCostComputation.insert(IVInst).second)
6993 continue;
6994 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
6995 LLVM_DEBUG({
6996 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6997 << ": induction instruction " << *IVInst << "\n";
6998 });
6999 Cost += InductionCost;
7000 }
7001 }
7002
7003   // Compute the cost of all exiting conditions of the loop using the legacy
7004   // cost model. This is to match the legacy behavior, which adds the cost of
7005   // all exit conditions. Note that this over-estimates the cost, as there will
7006   // be a single condition to control the vector loop.
7007 SmallVector<BasicBlock *> Exiting;
7008 CM.TheLoop->getExitingBlocks(Exiting);
7009 SetVector<Instruction *> ExitInstrs;
7010 // Collect all exit conditions.
7011 for (BasicBlock *EB : Exiting) {
7012 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7013 if (!Term)
7014 continue;
7015 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7016 ExitInstrs.insert(CondI);
7017 }
7018 }
7019 // Compute the cost of all instructions only feeding the exit conditions.
7020 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7021 Instruction *CondI = ExitInstrs[I];
7022 if (!OrigLoop->contains(CondI) ||
7023 !CostCtx.SkipCostComputation.insert(CondI).second)
7024 continue;
7025 Cost += CostCtx.getLegacyCost(CondI, VF);
7026 for (Value *Op : CondI->operands()) {
7027 auto *OpI = dyn_cast<Instruction>(Op);
7028 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7029 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7030 !ExitInstrs.contains(cast<Instruction>(U));
7031 }))
7032 continue;
7033 ExitInstrs.insert(OpI);
7034 }
7035 }
7036
7037 // The legacy cost model has special logic to compute the cost of in-loop
7038 // reductions, which may be smaller than the sum of all instructions involved
7039 // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7040 // which the legacy cost model uses to assign cost. Pre-compute their costs
7041 // for now.
7042 // TODO: Switch to costing based on VPlan once the logic has been ported.
7043 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7044 if (!CM.isInLoopReduction(RedPhi) &&
7045 !RecurrenceDescriptor::isAnyOfRecurrenceKind(
7046 RdxDesc.getRecurrenceKind()))
7047 continue;
7048
7049 // AnyOf reduction codegen may remove the select. To match the legacy cost
7050 // model, pre-compute the cost for AnyOf reductions here.
7051 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7052 RdxDesc.getRecurrenceKind())) {
7053 auto *Select = cast<SelectInst>(*find_if(
7054 RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
7055 assert(!CostCtx.SkipCostComputation.contains(Select) &&
7056 "reduction op visited multiple times");
7057 CostCtx.SkipCostComputation.insert(Select);
7058 auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
7059 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7060 << ":\n any-of reduction " << *Select << "\n");
7061 Cost += ReductionCost;
7062 continue;
7063 }
7064
7065 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7066 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7067 ChainOps.end());
7068 // Also include the operands of instructions in the chain, as the cost-model
7069 // may mark extends as free.
7070 for (auto *ChainOp : ChainOps) {
7071 for (Value *Op : ChainOp->operands()) {
7072 if (auto *I = dyn_cast<Instruction>(Op))
7073 ChainOpsAndOperands.insert(I);
7074 }
7075 }
7076
7077 // Pre-compute the cost for I, if it has a reduction pattern cost.
7078 for (Instruction *I : ChainOpsAndOperands) {
7079 auto ReductionCost = CM.getReductionPatternCost(
7080 I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7081 if (!ReductionCost)
7082 continue;
7083
7084 assert(!CostCtx.SkipCostComputation.contains(I) &&
7085 "reduction op visited multiple times");
7086 CostCtx.SkipCostComputation.insert(I);
7087 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7088 << ":\n in-loop reduction " << *I << "\n");
7089 Cost += *ReductionCost;
7090 }
7091 }
7092
7093 // Pre-compute the costs for branches except for the backedge, as the number
7094 // of replicate regions in a VPlan may not directly match the number of
7095 // branches, which would lead to different decisions.
7096 // TODO: Compute cost of branches for each replicate region in the VPlan,
7097 // which is more accurate than the legacy cost model.
7098 for (BasicBlock *BB : OrigLoop->blocks()) {
7099 if (BB == OrigLoop->getLoopLatch())
7100 continue;
7101 CostCtx.SkipCostComputation.insert(BB->getTerminator());
7102 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7103 Cost += BranchCost;
7104 }
7105 // Now compute and add the VPlan-based cost.
7106 Cost += Plan.cost(VF, CostCtx);
7107 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7108 return Cost;
7109 }
7110
7111 VPlan &LoopVectorizationPlanner::getBestPlan() const {
7112 // If there is a single VPlan with a single VF, return it directly.
7113 VPlan &FirstPlan = *VPlans[0];
7114 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7115 return FirstPlan;
7116
7117 VPlan *BestPlan = &FirstPlan;
7118 ElementCount ScalarVF = ElementCount::getFixed(1);
7119 assert(hasPlanWithVF(ScalarVF) &&
7120 "More than a single plan/VF w/o any plan having scalar VF");
7121
7122 // TODO: Compute scalar cost using VPlan-based cost model.
7123 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7124 VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
7125
7126 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7127 if (ForceVectorization) {
7128 // Ignore scalar width, because the user explicitly wants vectorization.
7129 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7130 // evaluation.
7131 BestFactor.Cost = InstructionCost::getMax();
7132 }
7133
7134 for (auto &P : VPlans) {
7135 for (ElementCount VF : P->vectorFactors()) {
7136 if (VF.isScalar())
7137 continue;
7138 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7139 LLVM_DEBUG(
7140 dbgs()
7141 << "LV: Not considering vector loop of width " << VF
7142 << " because it will not generate any vector instructions.\n");
7143 continue;
7144 }
7145
7146 InstructionCost Cost = cost(*P, VF);
7147 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7148 if (isMoreProfitable(CurrentFactor, BestFactor)) {
7149 BestFactor = CurrentFactor;
7150 BestPlan = &*P;
7151 }
7152 }
7153 }
7154 BestPlan->setVF(BestFactor.Width);
7155 return *BestPlan;
7156 }
7157
7158 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7159 assert(count_if(VPlans,
7160 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7161 1 &&
7162          "Best VF does not have a single VPlan.");
7163
7164 for (const VPlanPtr &Plan : VPlans) {
7165 if (Plan->hasVF(VF))
7166 return *Plan.get();
7167 }
7168 llvm_unreachable("No plan found!");
7169 }
7170
7171 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7172 SmallVector<Metadata *, 4> MDs;
7173 // Reserve first location for self reference to the LoopID metadata node.
7174 MDs.push_back(nullptr);
7175 bool IsUnrollMetadata = false;
7176 MDNode *LoopID = L->getLoopID();
7177 if (LoopID) {
7178 // First find existing loop unrolling disable metadata.
7179 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7180 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7181 if (MD) {
7182 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7183 IsUnrollMetadata =
7184 S && S->getString().starts_with("llvm.loop.unroll.disable");
7185 }
7186 MDs.push_back(LoopID->getOperand(i));
7187 }
7188 }
7189
7190 if (!IsUnrollMetadata) {
7191 // Add runtime unroll disable metadata.
7192 LLVMContext &Context = L->getHeader()->getContext();
7193 SmallVector<Metadata *, 1> DisableOperands;
7194 DisableOperands.push_back(
7195 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7196 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7197 MDs.push_back(DisableNode);
7198 MDNode *NewLoopID = MDNode::get(Context, MDs);
7199 // Set operand 0 to refer to the loop id itself.
7200 NewLoopID->replaceOperandWith(0, NewLoopID);
7201 L->setLoopID(NewLoopID);
7202 }
7203 }
7204
7205 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7206 // create a merge phi node for it and add it to \p ReductionResumeValues.
7207 static void createAndCollectMergePhiForReduction(
7208 VPInstruction *RedResult,
7209 DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7210 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
7211 bool VectorizingEpilogue) {
7212 if (!RedResult ||
7213 RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7214 return;
7215
7216 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7217 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7218
7219 Value *FinalValue =
7220 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7221 auto *ResumePhi =
7222 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7223 if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
7224 RdxDesc.getRecurrenceKind())) {
7225 auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
7226 assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
7227 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
7228 ResumePhi = cast<PHINode>(Cmp->getOperand(0));
7229 }
7230 assert((!VectorizingEpilogue || ResumePhi) &&
7231 "when vectorizing the epilogue loop, we need a resume phi from main "
7232 "vector loop");
7233
7234 // TODO: bc.merge.rdx should not be created here, instead it should be
7235 // modeled in VPlan.
7236 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7237 // Create a phi node that merges control-flow from the backedge-taken check
7238 // block and the middle block.
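  // Illustrative shape of the merge phi created below (hypothetical names):
  //   %bc.merge.rdx = phi <ty> [ %rdx.final, %middle.block ],
  //                            [ %rdx.start, %bypass.block ]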
7239 auto *BCBlockPhi =
7240 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7241 LoopScalarPreHeader->getTerminator()->getIterator());
7242
7243 // If we are fixing reductions in the epilogue loop then we should already
7244 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7245 // we carry over the incoming values correctly.
7246 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7247 if (Incoming == LoopMiddleBlock)
7248 BCBlockPhi->addIncoming(FinalValue, Incoming);
7249 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7250 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7251 Incoming);
7252 else
7253 BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
7254 }
7255
7256 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7257 // TODO: This fixup should instead be modeled in VPlan.
7258 // Fix the scalar loop reduction variable with the incoming reduction sum
7259 // from the vector body and from the backedge value.
7260 int IncomingEdgeBlockIdx =
7261 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7262 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7263 // Pick the other block.
7264 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7265 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7266 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7267 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7268
7269 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7270 }
7271
7272 std::pair<DenseMap<const SCEV *, Value *>,
7273 DenseMap<const RecurrenceDescriptor *, Value *>>
7274 LoopVectorizationPlanner::executePlan(
7275 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7276 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7277 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7278 assert(BestVPlan.hasVF(BestVF) &&
7279 "Trying to execute plan with unsupported VF");
7280 assert(BestVPlan.hasUF(BestUF) &&
7281 "Trying to execute plan with unsupported UF");
7282 assert(
7283 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7284 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7285 (void)IsEpilogueVectorization;
7286
7287 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7288
7289 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7290 << ", UF=" << BestUF << '\n');
7291 BestVPlan.setName("Final VPlan");
7292 LLVM_DEBUG(BestVPlan.dump());
7293
7294 // Perform the actual loop transformation.
7295 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7296 OrigLoop->getHeader()->getContext());
7297
7298 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7299 // before making any changes to the CFG.
7300 if (!BestVPlan.getPreheader()->empty()) {
7301 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7302 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7303 BestVPlan.getPreheader()->execute(&State);
7304 }
7305 if (!ILV.getTripCount())
7306 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7307 else
7308 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7309 "count during epilogue vectorization");
7310
7311 // 1. Set up the skeleton for vectorization, including vector pre-header and
7312 // middle block. The vector loop is created during VPlan execution.
7313 Value *CanonicalIVStartValue;
7314 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7315 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7316 : State.ExpandedSCEVs);
7317 #ifdef EXPENSIVE_CHECKS
7318 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7319 #endif
7320
7321 // Only use noalias metadata when using memory checks guaranteeing no overlap
7322 // across all iterations.
7323 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7324 std::unique_ptr<LoopVersioning> LVer = nullptr;
7325 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7326 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7327
7328 // We currently don't use LoopVersioning for the actual loop cloning but we
7329 // still use it to add the noalias metadata.
7330 // TODO: Find a better way to re-use LoopVersioning functionality to add
7331 // metadata.
7332 LVer = std::make_unique<LoopVersioning>(
7333 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7334 PSE.getSE());
7335 State.LVer = &*LVer;
7336 State.LVer->prepareNoAliasMetadata();
7337 }
7338
7339 ILV.printDebugTracesAtStart();
7340
7341 //===------------------------------------------------===//
7342 //
7343   // Notice: any optimizations or new instructions that go
7344 // into the code below should also be implemented in
7345 // the cost-model.
7346 //
7347 //===------------------------------------------------===//
7348
7349 // 2. Copy and widen instructions from the old loop into the new loop.
7350 BestVPlan.prepareToExecute(ILV.getTripCount(),
7351 ILV.getOrCreateVectorTripCount(nullptr),
7352 CanonicalIVStartValue, State);
7353
7354 BestVPlan.execute(&State);
7355
7356 // 2.5 Collect reduction resume values.
7357 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7358 auto *ExitVPBB =
7359 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7360 for (VPRecipeBase &R : *ExitVPBB) {
7361 createAndCollectMergePhiForReduction(
7362 dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7363 State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7364 }
7365
7366 // 2.6. Maintain Loop Hints
7367 // Keep all loop hints from the original loop on the vector loop (we'll
7368 // replace the vectorizer-specific hints below).
7369 MDNode *OrigLoopID = OrigLoop->getLoopID();
7370
7371 std::optional<MDNode *> VectorizedLoopID =
7372 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7373 LLVMLoopVectorizeFollowupVectorized});
7374
7375 VPBasicBlock *HeaderVPBB =
7376 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7377 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7378 if (VectorizedLoopID)
7379 L->setLoopID(*VectorizedLoopID);
7380 else {
7381 // Keep all loop hints from the original loop on the vector loop (we'll
7382 // replace the vectorizer-specific hints below).
7383 if (MDNode *LID = OrigLoop->getLoopID())
7384 L->setLoopID(LID);
7385
7386 LoopVectorizeHints Hints(L, true, *ORE);
7387 Hints.setAlreadyVectorized();
7388 }
7389 TargetTransformInfo::UnrollingPreferences UP;
7390 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7391 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7392 AddRuntimeUnrollDisableMetaData(L);
7393
7394 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7395 // predication, updating analyses.
7396 ILV.fixVectorizedLoop(State, BestVPlan);
7397
7398 ILV.printDebugTracesAtEnd();
7399
7400 // 4. Adjust branch weight of the branch in the middle block.
7401 auto *MiddleTerm =
7402 cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7403 if (MiddleTerm->isConditional() &&
7404 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7405 // Assume that `Count % VectorTripCount` is equally distributed.
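    // For example (hypothetical factors): with VF = 4 and UF = 2 the step
    // below is 8, so the weights become {1, 7}, i.e. a 1-in-8 split between
    // the two successors of the middle-block branch.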
7406 unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7407 assert(TripCount > 0 && "trip count should not be zero");
7408 const uint32_t Weights[] = {1, TripCount - 1};
7409 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7410 }
7411
7412 return {State.ExpandedSCEVs, ReductionResumeValues};
7413 }
7414
7415 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7416 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7417 for (const auto &Plan : VPlans)
7418 if (PrintVPlansInDotFormat)
7419 Plan->printDOT(O);
7420 else
7421 Plan->print(O);
7422 }
7423 #endif
7424
7425 //===--------------------------------------------------------------------===//
7426 // EpilogueVectorizerMainLoop
7427 //===--------------------------------------------------------------------===//
7428
7429 /// This function is partially responsible for generating the control flow
7430 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7431 std::pair<BasicBlock *, Value *>
7432 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7433 const SCEV2ValueTy &ExpandedSCEVs) {
7434 createVectorLoopSkeleton("");
7435
7436 // Generate the code to check the minimum iteration count of the vector
7437 // epilogue (see below).
7438 EPI.EpilogueIterationCountCheck =
7439 emitIterationCountCheck(LoopScalarPreHeader, true);
7440 EPI.EpilogueIterationCountCheck->setName("iter.check");
7441
7442 // Generate the code to check any assumptions that we've made for SCEV
7443 // expressions.
7444 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7445
7446 // Generate the code that checks at runtime if arrays overlap. We put the
7447 // checks into a separate block to make the more common case of few elements
7448 // faster.
7449 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7450
7451 // Generate the iteration count check for the main loop, *after* the check
7452 // for the epilogue loop, so that the path-length is shorter for the case
7453 // that goes directly through the vector epilogue. The longer-path length for
7454 // the main loop is compensated for, by the gain from vectorizing the larger
7455 // trip count. Note: the branch will get updated later on when we vectorize
7456 // the epilogue.
7457 EPI.MainLoopIterationCountCheck =
7458 emitIterationCountCheck(LoopScalarPreHeader, false);
7459
7460 // Generate the induction variable.
7461 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7462
7463 // Skip induction resume value creation here because they will be created in
7464 // the second pass for the scalar loop. The induction resume values for the
7465 // inductions in the epilogue loop are created before executing the plan for
7466 // the epilogue loop.
7467
7468 return {LoopVectorPreHeader, nullptr};
7469 }
7470
7471 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7472 LLVM_DEBUG({
7473 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7474 << "Main Loop VF:" << EPI.MainLoopVF
7475 << ", Main Loop UF:" << EPI.MainLoopUF
7476 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7477 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7478 });
7479 }
7480
7481 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7482 DEBUG_WITH_TYPE(VerboseDebug, {
7483 dbgs() << "intermediate fn:\n"
7484 << *OrigLoop->getHeader()->getParent() << "\n";
7485 });
7486 }
7487
7488 BasicBlock *
7489 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7490 bool ForEpilogue) {
7491 assert(Bypass && "Expected valid bypass basic block.");
7492 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7493 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7494 Value *Count = getTripCount();
7495 // Reuse existing vector loop preheader for TC checks.
7496 // Note that new preheader block is generated for vector loop.
7497 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7498 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7499
7500 // Generate code to check if the loop's trip count is less than VF * UF of the
7501 // main vector loop.
7502 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7503 : VF.isVector())
7504 ? ICmpInst::ICMP_ULE
7505 : ICmpInst::ICMP_ULT;
7506
7507 Value *CheckMinIters = Builder.CreateICmp(
7508 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7509 "min.iters.check");
7510
7511 if (!ForEpilogue)
7512 TCCheckBlock->setName("vector.main.loop.iter.check");
7513
7514 // Create new preheader for vector loop.
7515 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7516 DT, LI, nullptr, "vector.ph");
7517
7518 if (ForEpilogue) {
7519 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7520 DT->getNode(Bypass)->getIDom()) &&
7521 "TC check is expected to dominate Bypass");
7522
7523 // Update dominator for Bypass.
7524 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7525 LoopBypassBlocks.push_back(TCCheckBlock);
7526
7527 // Save the trip count so we don't have to regenerate it in the
7528 // vec.epilog.iter.check. This is safe to do because the trip count
7529 // generated here dominates the vector epilog iter check.
7530 EPI.TripCount = Count;
7531 }
7532
7533 BranchInst &BI =
7534 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7535 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7536 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7537 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7538
7539 return TCCheckBlock;
7540 }
7541
7542 //===--------------------------------------------------------------------===//
7543 // EpilogueVectorizerEpilogueLoop
7544 //===--------------------------------------------------------------------===//
7545
7546 /// This function is partially responsible for generating the control flow
7547 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7548 std::pair<BasicBlock *, Value *>
7549 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7550 const SCEV2ValueTy &ExpandedSCEVs) {
7551 createVectorLoopSkeleton("vec.epilog.");
7552
7553 // Now, compare the remaining count and if there aren't enough iterations to
7554 // execute the vectorized epilogue skip to the scalar part.
7555 LoopVectorPreHeader->setName("vec.epilog.ph");
7556 BasicBlock *VecEpilogueIterationCountCheck =
7557 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
7558 nullptr, "vec.epilog.iter.check", true);
7559 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7560 VecEpilogueIterationCountCheck);
7561
7562 // Adjust the control flow taking the state info from the main loop
7563 // vectorization into account.
7564 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7565 "expected this to be saved from the previous pass.");
7566 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7567 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7568
7569 DT->changeImmediateDominator(LoopVectorPreHeader,
7570 EPI.MainLoopIterationCountCheck);
7571
7572 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7573 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7574
7575 if (EPI.SCEVSafetyCheck)
7576 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7577 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7578 if (EPI.MemSafetyCheck)
7579 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7580 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7581
7582 DT->changeImmediateDominator(
7583 VecEpilogueIterationCountCheck,
7584 VecEpilogueIterationCountCheck->getSinglePredecessor());
7585
7586 DT->changeImmediateDominator(LoopScalarPreHeader,
7587 EPI.EpilogueIterationCountCheck);
7588 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7589 // If there is an epilogue which must run, there's no edge from the
7590 // middle block to exit blocks and thus no need to update the immediate
7591 // dominator of the exit blocks.
7592 DT->changeImmediateDominator(LoopExitBlock,
7593 EPI.EpilogueIterationCountCheck);
7594
7595 // Keep track of bypass blocks, as they feed start values to the induction and
7596 // reduction phis in the scalar loop preheader.
7597 if (EPI.SCEVSafetyCheck)
7598 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7599 if (EPI.MemSafetyCheck)
7600 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7601 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7602
7603 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7604 // reductions which merge control-flow from the latch block and the middle
7605 // block. Update the incoming values here and move the Phi into the preheader.
7606 SmallVector<PHINode *, 4> PhisInBlock;
7607 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7608 PhisInBlock.push_back(&Phi);
7609
7610 for (PHINode *Phi : PhisInBlock) {
7611 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7612 Phi->replaceIncomingBlockWith(
7613 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7614 VecEpilogueIterationCountCheck);
7615
7616 // If the phi doesn't have an incoming value from the
7617 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7618 // value and also those from other check blocks. This is needed for
7619 // reduction phis only.
7620 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7621 return EPI.EpilogueIterationCountCheck == IncB;
7622 }))
7623 continue;
7624 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7625 if (EPI.SCEVSafetyCheck)
7626 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7627 if (EPI.MemSafetyCheck)
7628 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7629 }
7630
7631 // Generate a resume induction for the vector epilogue and put it in the
7632   // vector epilogue preheader.
7633 Type *IdxTy = Legal->getWidestInductionType();
7634 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7635 EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7636 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7637 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7638 EPI.MainLoopIterationCountCheck);
7639
7640 // Generate induction resume values. These variables save the new starting
7641 // indexes for the scalar loop. They are used to test if there are any tail
7642 // iterations left once the vector loop has completed.
7643 // Note that when the vectorized epilogue is skipped due to iteration count
7644 // check, then the resume value for the induction variable comes from
7645 // the trip count of the main vector loop, hence passing the AdditionalBypass
7646 // argument.
7647 createInductionResumeValues(ExpandedSCEVs,
7648 {VecEpilogueIterationCountCheck,
7649 EPI.VectorTripCount} /* AdditionalBypass */);
7650
7651 return {LoopVectorPreHeader, EPResumeVal};
7652 }
7653
7654 BasicBlock *
7655 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7656 BasicBlock *Bypass, BasicBlock *Insert) {
7657
7658 assert(EPI.TripCount &&
7659          "Expected trip count to have been saved in the first pass.");
7660 assert(
7661 (!isa<Instruction>(EPI.TripCount) ||
7662 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7663 "saved trip count does not dominate insertion point.");
7664 Value *TC = EPI.TripCount;
7665 IRBuilder<> Builder(Insert->getTerminator());
7666 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7667
7668 // Generate code to check if the loop's trip count is less than VF * UF of the
7669 // vector epilogue loop.
7670 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7671 ? ICmpInst::ICMP_ULE
7672 : ICmpInst::ICMP_ULT;
7673
7674 Value *CheckMinIters =
7675 Builder.CreateICmp(P, Count,
7676 createStepForVF(Builder, Count->getType(),
7677 EPI.EpilogueVF, EPI.EpilogueUF),
7678 "min.epilog.iters.check");
7679
7680 BranchInst &BI =
7681 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7682 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7683 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7684 unsigned EpilogueLoopStep =
7685 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7686     // We assume the remaining `Count` is equally distributed in
7687     // [0, MainLoopStep), so the probability for
7688     // `Count < EpilogueLoopStep` should be
7689     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
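    // For example (hypothetical factors): MainLoopStep = 8 and
    // EpilogueLoopStep = 4 give EstimatedSkipCount = 4 and weights {4, 4},
    // i.e. the epilogue is expected to be skipped about half the time.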
7690 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7691 const uint32_t Weights[] = {EstimatedSkipCount,
7692 MainLoopStep - EstimatedSkipCount};
7693 setBranchWeights(BI, Weights, /*IsExpected=*/false);
7694 }
7695 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7696 LoopBypassBlocks.push_back(Insert);
7697 return Insert;
7698 }
7699
7700 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7701 LLVM_DEBUG({
7702 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7703 << "Epilogue Loop VF:" << EPI.EpilogueVF
7704 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7705 });
7706 }
7707
7708 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7709 DEBUG_WITH_TYPE(VerboseDebug, {
7710 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7711 });
7712 }
7713
7714 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7715 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7716 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7717 bool PredicateAtRangeStart = Predicate(Range.Start);
7718
7719 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7720 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7721 Range.End = TmpVF;
7722 break;
7723 }
7724
7725 return PredicateAtRangeStart;
7726 }
7727
7728 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7729 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7730 /// of VF's starting at a given VF and extending it as much as possible. Each
7731 /// vectorization decision can potentially shorten this sub-range during
7732 /// buildVPlan().
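/// For example (hypothetical range): with MinVF = 1 and MaxVF = 8 this may
/// produce one plan covering {1, 2} and another covering {4, 8}, depending on
/// how the per-VF decisions clamp each sub-range.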
7733 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7734 ElementCount MaxVF) {
7735 auto MaxVFTimes2 = MaxVF * 2;
7736 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7737 VFRange SubRange = {VF, MaxVFTimes2};
7738 VPlans.push_back(buildVPlan(SubRange));
7739 VF = SubRange.End;
7740 }
7741 }
7742
7743 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7744 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
7745 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7746 if (auto *I = dyn_cast<Instruction>(Op)) {
7747 if (auto *R = Ingredient2Recipe.lookup(I))
7748 return R->getVPSingleValue();
7749 }
7750 return Plan.getOrAddLiveIn(Op);
7751 };
7752 return map_range(Operands, Fn);
7753 }
7754
7755 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
7756 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7757
7758 // Look for cached value.
7759 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7760 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7761 if (ECEntryIt != EdgeMaskCache.end())
7762 return ECEntryIt->second;
7763
7764 VPValue *SrcMask = getBlockInMask(Src);
7765
7766 // The terminator has to be a branch inst!
7767 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7768 assert(BI && "Unexpected terminator found");
7769
7770 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7771 return EdgeMaskCache[Edge] = SrcMask;
7772
7773 // If source is an exiting block, we know the exit edge is dynamically dead
7774 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7775 // adding uses of an otherwise potentially dead instruction.
7776 if (OrigLoop->isLoopExiting(Src))
7777 return EdgeMaskCache[Edge] = SrcMask;
7778
7779 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
7780 assert(EdgeMask && "No Edge Mask found for condition");
7781
7782 if (BI->getSuccessor(0) != Dst)
7783 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7784
7785 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7786 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
7787 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
7788 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7789 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
7790 }
7791
7792 return EdgeMaskCache[Edge] = EdgeMask;
7793 }
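// As an illustration of the masks built above (schematic IR, not the exact
// recipes): for an edge taken when the branch condition %c is false, with a
// source block mask %src.mask, the edge mask becomes roughly
//   %not.c     = xor i1 %c, true
//   %edge.mask = select i1 %src.mask, i1 %not.c, i1 false
// i.e. a logical AND that avoids propagating poison from %not.c when
// %src.mask is false.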
7794
7795 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
7796 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7797
7798 // Look for cached value.
7799 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7800 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
7801 assert(ECEntryIt != EdgeMaskCache.end() &&
7802 "looking up mask for edge which has not been created");
7803 return ECEntryIt->second;
7804 }
7805
7806 void VPRecipeBuilder::createHeaderMask() {
7807 BasicBlock *Header = OrigLoop->getHeader();
7808
7809 // When not folding the tail, use nullptr to model all-true mask.
7810 if (!CM.foldTailByMasking()) {
7811 BlockMaskCache[Header] = nullptr;
7812 return;
7813 }
7814
7815 // Introduce the early-exit compare IV <= BTC to form header block mask.
7816 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
7817 // constructing the desired canonical IV in the header block as its first
7818 // non-phi instructions.
7819
7820 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7821 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
7822 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
7823 HeaderVPBB->insert(IV, NewInsertionPoint);
7824
7825 VPBuilder::InsertPointGuard Guard(Builder);
7826 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
7827 VPValue *BlockMask = nullptr;
7828 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
7829 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
7830 BlockMaskCache[Header] = BlockMask;
7831 }
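// Schematically (a sketch for VF=4, UF=1, names illustrative): with canonical
// IV %iv and backedge-taken count %btc, the header mask is roughly
//   %wide.iv     = <%iv, %iv+1, %iv+2, %iv+3>
//   %header.mask = icmp ule <4 x i64> %wide.iv, splat(%btc)
// comparing against BTC rather than the trip count because TC may wrap.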
7832
7833 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
7834 // Return the cached value.
7835 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
7836 assert(BCEntryIt != BlockMaskCache.end() &&
7837 "Trying to access mask for block without one.");
7838 return BCEntryIt->second;
7839 }
7840
7841 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
7842 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7843 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
7844 assert(OrigLoop->getHeader() != BB &&
7845 "Loop header must have cached block mask");
7846
7847 // All-one mask is modelled as no-mask following the convention for masked
7848 // load/store/gather/scatter. Initialize BlockMask to no-mask.
7849 VPValue *BlockMask = nullptr;
7850 // This is the block mask. We OR all incoming edges.
7851 for (auto *Predecessor : predecessors(BB)) {
7852 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
7853 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
7854 BlockMaskCache[BB] = EdgeMask;
7855 return;
7856 }
7857
7858 if (!BlockMask) { // BlockMask has its initialized nullptr value.
7859 BlockMask = EdgeMask;
7860 continue;
7861 }
7862
7863 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
7864 }
7865
7866 BlockMaskCache[BB] = BlockMask;
7867 }
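// For example (illustrative): a block with two masked predecessors ends up with
//   %block.mask = or i1 %edge.mask.pred0, %edge.mask.pred1
// while any incoming edge whose mask is all-one (modelled as nullptr) makes the
// whole block mask nullptr, meaning no mask is needed.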
7868
7869 VPWidenMemoryRecipe *
7870 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
7871 VFRange &Range) {
7872 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7873 "Must be called with either a load or store");
7874
7875 auto willWiden = [&](ElementCount VF) -> bool {
7876 LoopVectorizationCostModel::InstWidening Decision =
7877 CM.getWideningDecision(I, VF);
7878 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7879 "CM decision should be taken at this point.");
7880 if (Decision == LoopVectorizationCostModel::CM_Interleave)
7881 return true;
7882 if (CM.isScalarAfterVectorization(I, VF) ||
7883 CM.isProfitableToScalarize(I, VF))
7884 return false;
7885 return Decision != LoopVectorizationCostModel::CM_Scalarize;
7886 };
7887
7888 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7889 return nullptr;
7890
7891 VPValue *Mask = nullptr;
7892 if (Legal->isMaskRequired(I))
7893 Mask = getBlockInMask(I->getParent());
7894
7895 // Determine if the pointer operand of the access is either consecutive or
7896 // reverse consecutive.
7897 LoopVectorizationCostModel::InstWidening Decision =
7898 CM.getWideningDecision(I, Range.Start);
7899 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
7900 bool Consecutive =
7901 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
7902
7903 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
7904 if (Consecutive) {
7905 auto *GEP = dyn_cast<GetElementPtrInst>(
7906 Ptr->getUnderlyingValue()->stripPointerCasts());
7907 auto *VectorPtr = new VPVectorPointerRecipe(
7908 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
7909 I->getDebugLoc());
7910 Builder.getInsertBlock()->appendRecipe(VectorPtr);
7911 Ptr = VectorPtr;
7912 }
7913 if (LoadInst *Load = dyn_cast<LoadInst>(I))
7914 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7915 I->getDebugLoc());
7916
7917 StoreInst *Store = cast<StoreInst>(I);
7918 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
7919 Reverse, I->getDebugLoc());
7920 }
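// For example (illustrative): a load from a[i] with a unit-stride induction is
// widened into a single wide load through a VPVectorPointerRecipe, a negative
// unit stride takes the Reverse path, and anything else is left to
// gather/scatter or scalarization per the cost model's widening decision.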
7921
7922 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7923 /// insert a recipe to expand the step for the induction recipe.
7924 static VPWidenIntOrFpInductionRecipe *
7925 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
7926 VPValue *Start, const InductionDescriptor &IndDesc,
7927 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
7928 assert(IndDesc.getStartValue() ==
7929 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
7930 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7931 "step must be loop invariant");
7932
7933 VPValue *Step =
7934 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
7935 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
7936 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
7937 }
7938 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
7939 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
7940 }
7941
7942 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
7943 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
7944
7945 // Check if this is an integer or fp induction. If so, build the recipe that
7946 // produces its scalar and vector values.
7947 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7948 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
7949 *PSE.getSE(), *OrigLoop);
7950
7951 // Check if this is pointer induction. If so, build the recipe for it.
7952 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7953 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
7954 *PSE.getSE());
7955 return new VPWidenPointerInductionRecipe(
7956 Phi, Operands[0], Step, *II,
7957 LoopVectorizationPlanner::getDecisionAndClampRange(
7958 [&](ElementCount VF) {
7959 return CM.isScalarAfterVectorization(Phi, VF);
7960 },
7961 Range));
7962 }
7963 return nullptr;
7964 }
7965
7966 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
7967 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
7968 // Optimize the special case where the source is a constant integer
7969 // induction variable. Notice that we can only optimize the 'trunc' case
7970 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7971 // (c) other casts depend on pointer size.
7972
7973 // Determine whether \p K is a truncation based on an induction variable that
7974 // can be optimized.
7975 auto isOptimizableIVTruncate =
7976 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7977 return [=](ElementCount VF) -> bool {
7978 return CM.isOptimizableIVTruncate(K, VF);
7979 };
7980 };
7981
7982 if (LoopVectorizationPlanner::getDecisionAndClampRange(
7983 isOptimizableIVTruncate(I), Range)) {
7984
7985 auto *Phi = cast<PHINode>(I->getOperand(0));
7986 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
7987 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
7988 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
7989 *OrigLoop);
7990 }
7991 return nullptr;
7992 }
7993
7994 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
7995 ArrayRef<VPValue *> Operands) {
7996 unsigned NumIncoming = Phi->getNumIncomingValues();
7997
7998 // We know that all PHIs in non-header blocks are converted into selects, so
7999 // we don't have to worry about the insertion order and we can just use the
8000 // builder. At this point we generate the predication tree. There may be
8001 // duplications since this is a simple recursive scan, but future
8002 // optimizations will clean it up.
8003 // TODO: At the moment the first mask is always skipped, but it would be
8004 // better to skip the most expensive mask.
8005 SmallVector<VPValue *, 2> OperandsWithMask;
8006
8007 for (unsigned In = 0; In < NumIncoming; In++) {
8008 OperandsWithMask.push_back(Operands[In]);
8009 VPValue *EdgeMask =
8010 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8011 if (!EdgeMask) {
8012 assert(In == 0 && "Both null and non-null edge masks found");
8013 assert(all_equal(Operands) &&
8014 "Distinct incoming values with one having a full mask");
8015 break;
8016 }
8017 if (In == 0)
8018 continue;
8019 OperandsWithMask.push_back(EdgeMask);
8020 }
8021 return new VPBlendRecipe(Phi, OperandsWithMask);
8022 }
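// Example operand layout (hypothetical phi with incoming values V0, V1, V2 and
// edge masks M0, M1, M2): the recipe is built with operands
//   {V0, V1, M1, V2, M2}
// i.e. the first mask M0 is skipped, matching the TODO above about preferring
// to skip the most expensive mask instead.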
8023
8024 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8025 ArrayRef<VPValue *> Operands,
8026 VFRange &Range) {
8027 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8028 [this, CI](ElementCount VF) {
8029 return CM.isScalarWithPredication(CI, VF);
8030 },
8031 Range);
8032
8033 if (IsPredicated)
8034 return nullptr;
8035
8036 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8037 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8038 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8039 ID == Intrinsic::pseudoprobe ||
8040 ID == Intrinsic::experimental_noalias_scope_decl))
8041 return nullptr;
8042
8043 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8044 Ops.push_back(Operands.back());
8045
8046 // Is it beneficial to perform intrinsic call compared to lib call?
8047 bool ShouldUseVectorIntrinsic =
8048 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8049 [&](ElementCount VF) -> bool {
8050 return CM.getCallWideningDecision(CI, VF).Kind ==
8051 LoopVectorizationCostModel::CM_IntrinsicCall;
8052 },
8053 Range);
8054 if (ShouldUseVectorIntrinsic)
8055 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
8056 CI->getDebugLoc());
8057
8058 Function *Variant = nullptr;
8059 std::optional<unsigned> MaskPos;
8060 // Is it better to call a vectorized version of the function than to
8061 // scalarize the call?
8062 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8063 [&](ElementCount VF) -> bool {
8064 // The following case may be scalarized depending on the VF.
8065 // The flag shows whether we can use a usual Call for the vectorized
8066 // version of the instruction.
8067
8068 // If we've found a variant at a previous VF, then stop looking. A
8069 // vectorized variant of a function expects input in a certain shape
8070 // -- basically the number of input registers, the number of lanes
8071 // per register, and whether there's a mask required.
8072 // We store a pointer to the variant in the VPWidenCallRecipe, so
8073 // once we have an appropriate variant it's only valid for that VF.
8074 // This will force a different vplan to be generated for each VF that
8075 // finds a valid variant.
8076 if (Variant)
8077 return false;
8078 LoopVectorizationCostModel::CallWideningDecision Decision =
8079 CM.getCallWideningDecision(CI, VF);
8080 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8081 Variant = Decision.Variant;
8082 MaskPos = Decision.MaskPos;
8083 return true;
8084 }
8085
8086 return false;
8087 },
8088 Range);
8089 if (ShouldUseVectorCall) {
8090 if (MaskPos.has_value()) {
8091 // We have 2 cases that would require a mask:
8092 // 1) The block needs to be predicated, either due to a conditional
8093 // in the scalar loop or use of an active lane mask with
8094 // tail-folding, and we use the appropriate mask for the block.
8095 // 2) No mask is required for the block, but the only available
8096 // vector variant at this VF requires a mask, so we synthesize an
8097 // all-true mask.
8098 VPValue *Mask = nullptr;
8099 if (Legal->isMaskRequired(CI))
8100 Mask = getBlockInMask(CI->getParent());
8101 else
8102 Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(
8103 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8104
8105 Ops.insert(Ops.begin() + *MaskPos, Mask);
8106 }
8107
8108 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
8109 Intrinsic::not_intrinsic, CI->getDebugLoc(),
8110 Variant);
8111 }
8112
8113 return nullptr;
8114 }
8115
8116 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8117 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8118 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8119 // Instruction should be widened, unless it is scalar after vectorization,
8120 // scalarization is profitable or it is predicated.
8121 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8122 return CM.isScalarAfterVectorization(I, VF) ||
8123 CM.isProfitableToScalarize(I, VF) ||
8124 CM.isScalarWithPredication(I, VF);
8125 };
8126 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8127 Range);
8128 }
8129
8130 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8131 ArrayRef<VPValue *> Operands,
8132 VPBasicBlock *VPBB) {
8133 switch (I->getOpcode()) {
8134 default:
8135 return nullptr;
8136 case Instruction::SDiv:
8137 case Instruction::UDiv:
8138 case Instruction::SRem:
8139 case Instruction::URem: {
8140 // If not provably safe, use a select to form a safe divisor before widening the
8141 // div/rem operation itself. Otherwise fall through to general handling below.
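// For example (a sketch): with block mask %m, dividend %a and divisor %b, the
// predicated udiv is widened as roughly
//   %safe.b = select <VF x i1> %m, <VF x i32> %b, <VF x i32> splat(1)
//   %res    = udiv <VF x i32> %a, %safe.b
// so masked-off lanes divide by 1 instead of a potentially zero %b.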
8142 if (CM.isPredicatedInst(I)) {
8143 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8144 VPValue *Mask = getBlockInMask(I->getParent());
8145 VPValue *One =
8146 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8147 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8148 Ops[1] = SafeRHS;
8149 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8150 }
8151 [[fallthrough]];
8152 }
8153 case Instruction::Add:
8154 case Instruction::And:
8155 case Instruction::AShr:
8156 case Instruction::FAdd:
8157 case Instruction::FCmp:
8158 case Instruction::FDiv:
8159 case Instruction::FMul:
8160 case Instruction::FNeg:
8161 case Instruction::FRem:
8162 case Instruction::FSub:
8163 case Instruction::ICmp:
8164 case Instruction::LShr:
8165 case Instruction::Mul:
8166 case Instruction::Or:
8167 case Instruction::Select:
8168 case Instruction::Shl:
8169 case Instruction::Sub:
8170 case Instruction::Xor:
8171 case Instruction::Freeze:
8172 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8173 };
8174 }
8175
8176 void VPRecipeBuilder::fixHeaderPhis() {
8177 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8178 for (VPHeaderPHIRecipe *R : PhisToFix) {
8179 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8180 VPRecipeBase *IncR =
8181 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8182 R->addOperand(IncR->getVPSingleValue());
8183 }
8184 }
8185
8186 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8187 VFRange &Range) {
8188 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8189 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8190 Range);
8191
8192 bool IsPredicated = CM.isPredicatedInst(I);
8193
8194 // Even if the instruction is not marked as uniform, there are certain
8195 // intrinsic calls that can be effectively treated as such, so we check for
8196 // them here. Conservatively, we only do this for scalable vectors, since
8197 // for fixed-width VFs we can always fall back on full scalarization.
8198 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8199 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8200 case Intrinsic::assume:
8201 case Intrinsic::lifetime_start:
8202 case Intrinsic::lifetime_end:
8203 // For scalable vectors if one of the operands is variant then we still
8204 // want to mark as uniform, which will generate one instruction for just
8205 // the first lane of the vector. We can't scalarize the call in the same
8206 // way as for fixed-width vectors because we don't know how many lanes
8207 // there are.
8208 //
8209 // The reasons for doing it this way for scalable vectors are:
8210 // 1. For the assume intrinsic generating the instruction for the first
8211 // lane is still better than not generating any at all. For
8212 // example, the input may be a splat across all lanes.
8213 // 2. For the lifetime start/end intrinsics the pointer operand only
8214 // does anything useful when the input comes from a stack object,
8215 // which suggests it should always be uniform. For non-stack objects
8216 // the effect is to poison the object, which still allows us to
8217 // remove the call.
8218 IsUniform = true;
8219 break;
8220 default:
8221 break;
8222 }
8223 }
8224 VPValue *BlockInMask = nullptr;
8225 if (!IsPredicated) {
8226 // Finalize the recipe for Instr, first if it is not predicated.
8227 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8228 } else {
8229 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8230 // Instructions marked for predication are replicated and a mask operand is
8231 // added initially. Masked replicate recipes will later be placed under an
8232 // if-then construct to prevent side-effects. Generate recipes to compute
8233 // the block mask for this region.
8234 BlockInMask = getBlockInMask(I->getParent());
8235 }
8236
8237 // Note that there is some custom logic to mark some intrinsics as uniform
8238 // manually above for scalable vectors, which this assert needs to account for
8239 // as well.
8240 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8241 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8242 "Should not predicate a uniform recipe");
8243 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8244 IsUniform, BlockInMask);
8245 return Recipe;
8246 }
8247
8248 VPRecipeBase *
8249 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8250 ArrayRef<VPValue *> Operands,
8251 VFRange &Range, VPBasicBlock *VPBB) {
8252 // First, check for specific widening recipes that deal with inductions, Phi
8253 // nodes, calls and memory operations.
8254 VPRecipeBase *Recipe;
8255 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8256 if (Phi->getParent() != OrigLoop->getHeader())
8257 return tryToBlend(Phi, Operands);
8258
8259 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8260 return Recipe;
8261
8262 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8263 assert((Legal->isReductionVariable(Phi) ||
8264 Legal->isFixedOrderRecurrence(Phi)) &&
8265 "can only widen reductions and fixed-order recurrences here");
8266 VPValue *StartV = Operands[0];
8267 if (Legal->isReductionVariable(Phi)) {
8268 const RecurrenceDescriptor &RdxDesc =
8269 Legal->getReductionVars().find(Phi)->second;
8270 assert(RdxDesc.getRecurrenceStartValue() ==
8271 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8272 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8273 CM.isInLoopReduction(Phi),
8274 CM.useOrderedReductions(RdxDesc));
8275 } else {
8276 // TODO: Currently fixed-order recurrences are modeled as chains of
8277 // first-order recurrences. If there are no users of the intermediate
8278 // recurrences in the chain, the fixed order recurrence should be modeled
8279 // directly, enabling more efficient codegen.
8280 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8281 }
8282
8283 PhisToFix.push_back(PhiRecipe);
8284 return PhiRecipe;
8285 }
8286
8287 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8288 cast<TruncInst>(Instr), Operands, Range)))
8289 return Recipe;
8290
8291 // All widen recipes below deal only with VF > 1.
8292 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8293 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8294 return nullptr;
8295
8296 if (auto *CI = dyn_cast<CallInst>(Instr))
8297 return tryToWidenCall(CI, Operands, Range);
8298
8299 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8300 return tryToWidenMemory(Instr, Operands, Range);
8301
8302 if (!shouldWiden(Instr, Range))
8303 return nullptr;
8304
8305 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8306 return new VPWidenGEPRecipe(GEP,
8307 make_range(Operands.begin(), Operands.end()));
8308
8309 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8310 return new VPWidenSelectRecipe(
8311 *SI, make_range(Operands.begin(), Operands.end()));
8312 }
8313
8314 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8315 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8316 *CI);
8317 }
8318
8319 return tryToWiden(Instr, Operands, VPBB);
8320 }
8321
8322 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8323 ElementCount MaxVF) {
8324 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8325
8326 auto MaxVFTimes2 = MaxVF * 2;
8327 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8328 VFRange SubRange = {VF, MaxVFTimes2};
8329 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8330 // Now optimize the initial VPlan.
8331 if (!Plan->hasVF(ElementCount::getFixed(1)))
8332 VPlanTransforms::truncateToMinimalBitwidths(
8333 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8334 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8335 // TODO: try to put it close to addActiveLaneMask().
8336 // Discard the plan if it is not EVL-compatible
8337 if (CM.foldTailWithEVL() &&
8338 !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
8339 break;
8340 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8341 VPlans.push_back(std::move(Plan));
8342 }
8343 VF = SubRange.End;
8344 }
8345 }
8346
8347 // Add the necessary canonical IV and branch recipes required to control the
8348 // loop.
8349 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8350 DebugLoc DL) {
8351 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8352 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8353
8354 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8355 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8356 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8357 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8358 Header->insert(CanonicalIVPHI, Header->begin());
8359
8360 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8361 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8362 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8363 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8364 "index.next");
8365 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8366
8367 // Add the BranchOnCount VPInstruction to the latch.
8368 Builder.createNaryOp(VPInstruction::BranchOnCount,
8369 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8370 }
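// Schematically (a sketch of the emitted skeleton, names illustrative):
//   vector.body:
//     %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
//     ...
//     %index.next = add i64 %index, VF * UF   ; nuw when HasNUW
//     ; BranchOnCount compares %index.next against the vector trip count and
//     ; exits the vector loop region when they are equal.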
8371
8372 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8373 // original exit block.
8374 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8375 VPRecipeBuilder &Builder, VPlan &Plan) {
8376 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8377 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8378 // Only handle single-exit loops with unique exit blocks for now.
8379 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8380 return;
8381
8382 // Introduce VPUsers modeling the exit values.
8383 for (PHINode &ExitPhi : ExitBB->phis()) {
8384 Value *IncomingValue =
8385 ExitPhi.getIncomingValueForBlock(ExitingBB);
8386 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8387 // Exit values for inductions are computed and updated outside of VPlan and
8388 // independent of induction recipes.
8389 // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8390 // live-outs.
8391 if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8392 !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8393 isa<VPWidenPointerInductionRecipe>(V))
8394 continue;
8395 Plan.addLiveOut(&ExitPhi, V);
8396 }
8397 }
8398
8399 /// Feed a resume value for every FOR from the vector loop to the scalar loop,
8400 /// if middle block branches to scalar preheader, by introducing ExtractFromEnd
8401 /// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the
8402 /// latter and corresponds to the scalar header.
8403 static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
8404 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8405
8406 // Start by finding out if middle block branches to scalar preheader, which is
8407 // not a VPIRBasicBlock, unlike Exit block - the other possible successor of
8408 // middle block.
8409 // TODO: Should be replaced by
8410 // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
8411 // scalar region is modeled as well.
8412 VPBasicBlock *ScalarPHVPBB = nullptr;
8413 auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
8414 for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
8415 if (isa<VPIRBasicBlock>(Succ))
8416 continue;
8417 assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
8418 ScalarPHVPBB = cast<VPBasicBlock>(Succ);
8419 }
8420 if (!ScalarPHVPBB)
8421 return;
8422
8423 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8424 VPBuilder MiddleBuilder(MiddleVPBB);
8425 // Reset insert point so new recipes are inserted before terminator and
8426 // condition, if there is either the former or both.
8427 if (auto *Terminator = MiddleVPBB->getTerminator()) {
8428 auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
8429 assert((!Condition || Condition->getParent() == MiddleVPBB) &&
8430 "Condition expected in MiddleVPBB");
8431 MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
8432 }
8433 VPValue *OneVPV = Plan.getOrAddLiveIn(
8434 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8435
8436 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8437 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
8438 if (!FOR)
8439 continue;
8440
8441 // Extract the resume value and create a new VPLiveOut for it.
8442 auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
8443 {FOR->getBackedgeValue(), OneVPV},
8444 {}, "vector.recur.extract");
8445 auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
8446 VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
8447 "scalar.recur.init");
8448 Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe);
8449 }
8450 }
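// For a fixed-order recurrence like "b[i] = a[i] + a[i-1]" this roughly (for
// VF=4, UF=1) extracts the last element of the recurrence's vector in the
// middle block,
//   %vector.recur.extract = extractelement <4 x i32> %backedge.value, i32 3
// and the scalar.recur.init resume phi feeds either that value or the original
// start value (if the vector loop is skipped) into the scalar loop's phi.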
8451
8452 VPlanPtr
8453 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8454
8455 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8456
8457 // ---------------------------------------------------------------------------
8458 // Build initial VPlan: Scan the body of the loop in a topological order to
8459 // visit each basic block after having visited its predecessor basic blocks.
8460 // ---------------------------------------------------------------------------
8461
8462 // Create initial VPlan skeleton, having a basic block for the pre-header
8463 // which contains SCEV expansions that need to happen before the CFG is
8464 // modified; a basic block for the vector pre-header, followed by a region for
8465 // the vector loop, followed by the middle basic block. The skeleton vector
8466 // loop region contains a header and latch basic blocks.
8467
8468 bool RequiresScalarEpilogueCheck =
8469 LoopVectorizationPlanner::getDecisionAndClampRange(
8470 [this](ElementCount VF) {
8471 return !CM.requiresScalarEpilogue(VF.isVector());
8472 },
8473 Range);
8474 VPlanPtr Plan = VPlan::createInitialVPlan(
8475 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8476 *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
8477 OrigLoop);
8478
8479 // Don't use getDecisionAndClampRange here, because we don't know the UF
8480 // so it is better to be conservative here, rather than to split
8481 // it up into different VPlans.
8482 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8483 bool IVUpdateMayOverflow = false;
8484 for (ElementCount VF : Range)
8485 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8486
8487 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8488 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8489 // When not folding the tail, we know that the induction increment will not
8490 // overflow.
8491 bool HasNUW = Style == TailFoldingStyle::None;
8492 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8493
8494 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8495
8496 // ---------------------------------------------------------------------------
8497 // Pre-construction: record ingredients whose recipes we'll need to further
8498 // process after constructing the initial VPlan.
8499 // ---------------------------------------------------------------------------
8500
8501 // For each interleave group which is relevant for this (possibly trimmed)
8502 // Range, add it to the set of groups to be later applied to the VPlan and add
8503 // placeholders for its members' Recipes which we'll be replacing with a
8504 // single VPInterleaveRecipe.
8505 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8506 auto applyIG = [IG, this](ElementCount VF) -> bool {
8507 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8508 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8509 LoopVectorizationCostModel::CM_Interleave);
8510 // For scalable vectors, the only interleave factor currently supported
8511 // is 2 since we require the (de)interleave2 intrinsics instead of
8512 // shufflevectors.
8513 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8514 "Unsupported interleave factor for scalable vectors");
8515 return Result;
8516 };
8517 if (!getDecisionAndClampRange(applyIG, Range))
8518 continue;
8519 InterleaveGroups.insert(IG);
8520 };
8521
8522 // ---------------------------------------------------------------------------
8523 // Construct recipes for the instructions in the loop
8524 // ---------------------------------------------------------------------------
8525
8526 // Scan the body of the loop in a topological order to visit each basic block
8527 // after having visited its predecessor basic blocks.
8528 LoopBlocksDFS DFS(OrigLoop);
8529 DFS.perform(LI);
8530
8531 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
8532 VPBasicBlock *VPBB = HeaderVPBB;
8533 BasicBlock *HeaderBB = OrigLoop->getHeader();
8534 bool NeedsMasks =
8535 CM.foldTailByMasking() ||
8536 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8537 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8538 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8539 });
8540 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8541 // Relevant instructions from basic block BB will be grouped into VPRecipe
8542 // ingredients and fill a new VPBasicBlock.
8543 if (VPBB != HeaderVPBB)
8544 VPBB->setName(BB->getName());
8545 Builder.setInsertPoint(VPBB);
8546
8547 if (VPBB == HeaderVPBB)
8548 RecipeBuilder.createHeaderMask();
8549 else if (NeedsMasks)
8550 RecipeBuilder.createBlockInMask(BB);
8551
8552 // Introduce each ingredient into VPlan.
8553 // TODO: Model and preserve debug intrinsics in VPlan.
8554 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8555 Instruction *Instr = &I;
8556 SmallVector<VPValue *, 4> Operands;
8557 auto *Phi = dyn_cast<PHINode>(Instr);
8558 if (Phi && Phi->getParent() == HeaderBB) {
8559 Operands.push_back(Plan->getOrAddLiveIn(
8560 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8561 } else {
8562 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8563 Operands = {OpRange.begin(), OpRange.end()};
8564 }
8565
8566 // Invariant stores inside loop will be deleted and a single store
8567 // with the final reduction value will be added to the exit block
8568 StoreInst *SI;
8569 if ((SI = dyn_cast<StoreInst>(&I)) &&
8570 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8571 continue;
8572
8573 VPRecipeBase *Recipe =
8574 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8575 if (!Recipe)
8576 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8577
8578 RecipeBuilder.setRecipe(Instr, Recipe);
8579 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8580 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8581 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8582 // recipes and need to be moved to the phi section of HeaderVPBB:
8583 // * tail-folding (non-phi recipes computing the header mask are
8584 // introduced earlier than regular header phi recipes, and should appear
8585 // after them)
8586 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8587
8588 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8589 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8590 "unexpected recipe needs moving");
8591 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8592 } else
8593 VPBB->appendRecipe(Recipe);
8594 }
8595
8596 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8597 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8598 }
8599
8600 // After here, VPBB should not be used.
8601 VPBB = nullptr;
8602
8603 if (CM.requiresScalarEpilogue(Range)) {
8604 // No edge from the middle block to the unique exit block has been inserted
8605 // and there is nothing to fix from vector loop; phis should have incoming
8606 // from scalar loop only.
8607 } else
8608 addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
8609
8610 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8611 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8612 "entry block must be set to a VPRegionBlock having a non-empty entry "
8613 "VPBasicBlock");
8614 RecipeBuilder.fixHeaderPhis();
8615
8616 addLiveOutsForFirstOrderRecurrences(*Plan);
8617
8618 // ---------------------------------------------------------------------------
8619 // Transform initial VPlan: Apply previously taken decisions, in order, to
8620 // bring the VPlan to its final state.
8621 // ---------------------------------------------------------------------------
8622
8623 // Adjust the recipes for any inloop reductions.
8624 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8625
8626 // Interleave memory: for each Interleave Group we marked earlier as relevant
8627 // for this VPlan, replace the Recipes widening its memory instructions with a
8628 // single VPInterleaveRecipe at its insertion point.
8629 for (const auto *IG : InterleaveGroups) {
8630 auto *Recipe =
8631 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8632 SmallVector<VPValue *, 4> StoredValues;
8633 for (unsigned i = 0; i < IG->getFactor(); ++i)
8634 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8635 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8636 StoredValues.push_back(StoreR->getStoredValue());
8637 }
8638
8639 bool NeedsMaskForGaps =
8640 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8641 assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) &&
8642 "masked interleaved groups are not allowed.");
8643 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8644 Recipe->getMask(), NeedsMaskForGaps);
8645 VPIG->insertBefore(Recipe);
8646 unsigned J = 0;
8647 for (unsigned i = 0; i < IG->getFactor(); ++i)
8648 if (Instruction *Member = IG->getMember(i)) {
8649 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8650 if (!Member->getType()->isVoidTy()) {
8651 VPValue *OriginalV = MemberR->getVPSingleValue();
8652 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8653 J++;
8654 }
8655 MemberR->eraseFromParent();
8656 }
8657 }
8658
8659 for (ElementCount VF : Range)
8660 Plan->addVF(VF);
8661 Plan->setName("Initial VPlan");
8662
8663 // Replace VPValues for known constant strides guaranteed by predicate scalar
8664 // evolution.
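// For example (illustrative): if runtime checks guarantee that a symbolic
// stride %stride == 1, all VPlan uses of %stride (and of any sext/zext of it,
// handled below) are replaced by the constant 1, so later simplifications can
// treat the strided accesses as consecutive.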
8665 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8666 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8667 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8668 // Only handle constant strides for now.
8669 if (!ScevStride)
8670 continue;
8671
8672 auto *CI = Plan->getOrAddLiveIn(
8673 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
8674 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
8675 StrideVPV->replaceAllUsesWith(CI);
8676
8677 // The versioned value may not be used in the loop directly but through a
8678 // sext/zext. Add new live-ins in those cases.
8679 for (Value *U : StrideV->users()) {
8680 if (!isa<SExtInst, ZExtInst>(U))
8681 continue;
8682 VPValue *StrideVPV = Plan->getLiveIn(U);
8683 if (!StrideVPV)
8684 continue;
8685 unsigned BW = U->getType()->getScalarSizeInBits();
8686 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
8687 : ScevStride->getAPInt().zext(BW);
8688 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
8689 StrideVPV->replaceAllUsesWith(CI);
8690 }
8691 }
8692
8693 VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
8694 return Legal->blockNeedsPredication(BB);
8695 });
8696
8697 // Sink users of fixed-order recurrence past the recipe defining the previous
8698 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8699 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8700 return nullptr;
8701
8702 if (useActiveLaneMask(Style)) {
8703 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8704 // TailFoldingStyle is visible there.
8705 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8706 bool WithoutRuntimeCheck =
8707 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8708 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8709 WithoutRuntimeCheck);
8710 }
8711 return Plan;
8712 }
8713
8714 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8715 // Outer loop handling: They may require CFG and instruction level
8716 // transformations before even evaluating whether vectorization is profitable.
8717 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8718 // the vectorization pipeline.
8719 assert(!OrigLoop->isInnermost());
8720 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8721
8722 // Create new empty VPlan
8723 auto Plan = VPlan::createInitialVPlan(
8724 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8725 *PSE.getSE(), true, false, OrigLoop);
8726
8727 // Build hierarchical CFG
8728 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8729 HCFGBuilder.buildHierarchicalCFG();
8730
8731 for (ElementCount VF : Range)
8732 Plan->addVF(VF);
8733
8734 VPlanTransforms::VPInstructionsToVPRecipes(
8735 Plan,
8736 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8737 *PSE.getSE(), *TLI);
8738
8739 // Remove the existing terminator of the exiting block of the top-most region.
8740 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8741 auto *Term =
8742 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8743 Term->eraseFromParent();
8744
8745 // Tail folding is not supported for outer loops, so the induction increment
8746 // is guaranteed to not wrap.
8747 bool HasNUW = true;
8748 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8749 DebugLoc());
8750 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8751 return Plan;
8752 }
8753
8754 // Adjust the recipes for reductions. For in-loop reductions the chain of
8755 // instructions leading from the loop exit instr to the phi need to be converted
8756 // to reductions, with one operand being vector and the other being the scalar
8757 // reduction chain. For other reductions, a select is introduced between the phi
8758 // and live-out recipes when folding the tail.
8759 //
8760 // A ComputeReductionResult recipe is added to the middle block, also for
8761 // in-loop reductions which compute their result in-loop, because generating
8762 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8763 //
8764 // Adjust AnyOf reductions; replace the reduction phi for the selected value
8765 // with a boolean reduction phi node to check if the condition is true in any
8766 // iteration. The final value is selected by the final ComputeReductionResult.
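// For example (a sketch of the in-loop case): for "sum += a[i]" with an
// integer add reduction, the widened link
//   %v = add <VF x i32> %wide.load, %vec.phi
// is replaced by a VPReductionRecipe that reduces the vector operand (the wide
// load) and accumulates it into the scalar chain value, with
// ComputeReductionResult in the middle block producing the final sum.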
8767 void LoopVectorizationPlanner::adjustRecipesForReductions(
8768 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8769 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8770 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8771 // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8772 // sunk outside of the loop keep the same order as they had in the
8773 // original loop.
8774 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8775 for (VPRecipeBase &R : Header->phis()) {
8776 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8777 ReductionPHIList.emplace_back(ReductionPhi);
8778 }
8779 bool HasIntermediateStore = false;
8780 stable_sort(ReductionPHIList,
8781 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8782 const VPReductionPHIRecipe *R2) {
8783 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8784 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8785 HasIntermediateStore |= IS1 || IS2;
8786
8787 // If neither of the recipes has an intermediate store, keep the
8788 // order the same.
8789 if (!IS1 && !IS2)
8790 return false;
8791
8792 // If only one of the recipes has an intermediate store, then
8793 // move it towards the beginning of the list.
8794 if (IS1 && !IS2)
8795 return true;
8796
8797 if (!IS1 && IS2)
8798 return false;
8799
8800 // If both recipes have an intermediate store, then the recipe
8801 // with the later store should be processed earlier. So it
8802 // should go to the beginning of the list.
8803 return DT->dominates(IS2, IS1);
8804 });
8805
8806 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8807 for (VPRecipeBase *R : ReductionPHIList)
8808 R->moveBefore(*Header, Header->getFirstNonPhi());
8809
8810 for (VPRecipeBase &R : Header->phis()) {
8811 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8812 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8813 continue;
8814
8815 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8816 RecurKind Kind = RdxDesc.getRecurrenceKind();
8817 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
8818 "AnyOf reductions are not allowed for in-loop reductions");
8819
8820 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8821 SetVector<VPSingleDefRecipe *> Worklist;
8822 Worklist.insert(PhiR);
8823 for (unsigned I = 0; I != Worklist.size(); ++I) {
8824 VPSingleDefRecipe *Cur = Worklist[I];
8825 for (VPUser *U : Cur->users()) {
8826 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8827 if (!UserRecipe) {
8828 assert(isa<VPLiveOut>(U) &&
8829 "U must either be a VPSingleDef or VPLiveOut");
8830 continue;
8831 }
8832 Worklist.insert(UserRecipe);
8833 }
8834 }
8835
8836 // Visit operation "Links" along the reduction chain top-down starting from
8837 // the phi until LoopExitValue. We keep track of the previous item
8838 // (PreviousLink) to tell which of the two operands of a Link will remain
8839 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8840 // the select instructions. Blend recipes of in-loop reduction phi's will
8841 // get folded to their non-phi operand, as the reduction recipe handles the
8842 // condition directly.
8843 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8844 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8845 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8846
8847 // Index of the first operand which holds a non-mask vector operand.
8848 unsigned IndexOfFirstOperand;
8849 // Recognize a call to the llvm.fmuladd intrinsic.
8850 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8851 VPValue *VecOp;
8852 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8853 if (IsFMulAdd) {
8854 assert(
8855 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
8856 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8857 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8858 isa<VPWidenCallRecipe>(CurrentLink)) &&
8859 CurrentLink->getOperand(2) == PreviousLink &&
8860 "expected a call where the previous link is the added operand");
8861
8862 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8863 // need to create an fmul recipe (multiplying the first two operands of
8864 // the fmuladd together) to use as the vector operand for the fadd
8865 // reduction.
8866 VPInstruction *FMulRecipe = new VPInstruction(
8867 Instruction::FMul,
8868 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8869 CurrentLinkI->getFastMathFlags());
8870 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8871 VecOp = FMulRecipe;
8872 } else {
8873 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
8874 if (PhiR->isInLoop() && Blend) {
8875 assert(Blend->getNumIncomingValues() == 2 &&
8876 "Blend must have 2 incoming values");
8877 if (Blend->getIncomingValue(0) == PhiR)
8878 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
8879 else {
8880 assert(Blend->getIncomingValue(1) == PhiR &&
8881 "PhiR must be an operand of the blend");
8882 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
8883 }
8884 continue;
8885 }
8886
8887 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8888 if (isa<VPWidenRecipe>(CurrentLink)) {
8889 assert(isa<CmpInst>(CurrentLinkI) &&
8890 "need to have the compare of the select");
8891 continue;
8892 }
8893 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8894 "must be a select recipe");
8895 IndexOfFirstOperand = 1;
8896 } else {
8897 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8898 "Expected to replace a VPWidenSC");
8899 IndexOfFirstOperand = 0;
8900 }
8901 // Note that for non-commutable operands (cmp-selects), the semantics of
8902 // the cmp-select are captured in the recurrence kind.
8903 unsigned VecOpId =
8904 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
8905 ? IndexOfFirstOperand + 1
8906 : IndexOfFirstOperand;
8907 VecOp = CurrentLink->getOperand(VecOpId);
8908 assert(VecOp != PreviousLink &&
8909 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8910 (VecOpId - IndexOfFirstOperand)) ==
8911 PreviousLink &&
8912 "PreviousLink must be the operand other than VecOp");
8913 }
8914
8915 BasicBlock *BB = CurrentLinkI->getParent();
8916 VPValue *CondOp = nullptr;
8917 if (CM.blockNeedsPredicationForAnyReason(BB))
8918 CondOp = RecipeBuilder.getBlockInMask(BB);
8919
8920 VPReductionRecipe *RedRecipe =
8921 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
8922 CondOp, CM.useOrderedReductions(RdxDesc));
8923 // Append the recipe to the end of the VPBasicBlock because we need to
8924 // ensure that it comes after all of its inputs, including CondOp.
8925 // Note that this transformation may leave over dead recipes (including
8926 // CurrentLink), which will be cleaned by a later VPlan transform.
8927 LinkVPBB->appendRecipe(RedRecipe);
8928 CurrentLink->replaceAllUsesWith(RedRecipe);
8929 PreviousLink = RedRecipe;
8930 }
8931 }
8932 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8933 Builder.setInsertPoint(&*LatchVPBB->begin());
8934 VPBasicBlock *MiddleVPBB =
8935 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
8936 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8937 for (VPRecipeBase &R :
8938 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8939 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8940 if (!PhiR)
8941 continue;
8942
8943 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8944 // Adjust AnyOf reductions; replace the reduction phi for the selected value
8945 // with a boolean reduction phi node to check if the condition is true in
8946 // any iteration. The final value is selected by the final
8947 // ComputeReductionResult.
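// For example (illustrative): for "r = (a[i] > 42) ? 3 : r", the reduction phi
// is rewritten to a boolean phi starting at false and updated roughly as
//   %any = or i1 %phi, (%a > 42)
// and the final ComputeReductionResult later picks between the start value and
// the selected value based on whether %any was ever true.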
8948 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
8949 RdxDesc.getRecurrenceKind())) {
8950 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
8951 return isa<VPWidenSelectRecipe>(U) ||
8952 (isa<VPReplicateRecipe>(U) &&
8953 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
8954 Instruction::Select);
8955 }));
8956 VPValue *Cmp = Select->getOperand(0);
8957 // If the compare is checking the reduction PHI node, adjust it to check
8958 // the start value.
8959 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
8960 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
8961 if (CmpR->getOperand(I) == PhiR)
8962 CmpR->setOperand(I, PhiR->getStartValue());
8963 }
8964 VPBuilder::InsertPointGuard Guard(Builder);
8965 Builder.setInsertPoint(Select);
8966
8967 // If the true value of the select is the reduction phi, the new value is
8968 // selected if the negated condition is true in any iteration.
8969 if (Select->getOperand(1) == PhiR)
8970 Cmp = Builder.createNot(Cmp);
8971 VPValue *Or = Builder.createOr(PhiR, Cmp);
8972 Select->getVPSingleValue()->replaceAllUsesWith(Or);
8973
8974 // Convert the reduction phi to operate on bools.
8975 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
8976 OrigLoop->getHeader()->getContext())));
8977 }
8978
8979 // If tail is folded by masking, introduce selects between the phi
8980 // and the live-out instruction of each reduction, at the beginning of the
8981 // dedicated latch block.
8982 auto *OrigExitingVPV = PhiR->getBackedgeValue();
8983 auto *NewExitingVPV = PhiR->getBackedgeValue();
8984 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
8985 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
8986 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
8987 "reduction recipe must be defined before latch");
8988 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
8989 std::optional<FastMathFlags> FMFs =
8990 PhiTy->isFloatingPointTy()
8991 ? std::make_optional(RdxDesc.getFastMathFlags())
8992 : std::nullopt;
8993 NewExitingVPV =
8994 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
8995 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
8996 return isa<VPInstruction>(&U) &&
8997 cast<VPInstruction>(&U)->getOpcode() ==
8998 VPInstruction::ComputeReductionResult;
8999 });
9000 if (PreferPredicatedReductionSelect ||
9001 TTI.preferPredicatedReductionSelect(
9002 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9003 TargetTransformInfo::ReductionFlags()))
9004 PhiR->setOperand(1, NewExitingVPV);
9005 }
9006
9007 // If the vector reduction can be performed in a smaller type, we truncate
9008 // then extend the loop exit value to enable InstCombine to evaluate the
9009 // entire expression in the smaller type.
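// For example (a sketch): an i32 add reduction whose values are known to fit
// in i8 is narrowed to
//   %t = trunc <VF x i32> %exit.value to <VF x i8>
//   %e = sext <VF x i8> %t to <VF x i32>
// so InstCombine can later fold the whole chain into i8 arithmetic.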
9010 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9011 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9012 !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9013 RdxDesc.getRecurrenceKind())) {
9014 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9015 Type *RdxTy = RdxDesc.getRecurrenceType();
9016 auto *Trunc =
9017 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9018 auto *Extnd =
9019 RdxDesc.isSigned()
9020 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9021 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9022
9023 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9024 Extnd->insertAfter(Trunc);
9025 if (PhiR->getOperand(1) == NewExitingVPV)
9026 PhiR->setOperand(1, Extnd->getVPSingleValue());
9027 NewExitingVPV = Extnd;
9028 }
9029
9030 // We want code in the middle block to appear to execute on the location of
9031 // the scalar loop's latch terminator because: (a) it is all compiler
9032 // generated, (b) these instructions are always executed after evaluating
9033 // the latch conditional branch, and (c) other passes may add new
9034 // predecessors which terminate on this line. This is the easiest way to
9035 // ensure we don't accidentally cause an extra step back into the loop while
9036 // debugging.
9037 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9038
9039 // TODO: At the moment ComputeReductionResult also drives creation of the
9040 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9041 // even for in-loop reductions, until the reduction resume value handling is
9042 // also modeled in VPlan.
9043 auto *FinalReductionResult = new VPInstruction(
9044 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9045 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9046 OrigExitingVPV->replaceUsesWithIf(
9047 FinalReductionResult,
9048 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9049 }
9050
9051 VPlanTransforms::clearReductionWrapFlags(*Plan);
9052 }
9053
9054 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9055 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9056 "Not a pointer induction according to InductionDescriptor!");
9057 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9058 "Unexpected type.");
9059 assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
9060 "Recipe should have been replaced");
9061
9062 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9063 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9064 Type *PhiType = IndDesc.getStep()->getType();
9065
9066 // Build a pointer phi
9067 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9068 Type *ScStValueType = ScalarStartValue->getType();
9069 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9070 CanonicalIV->getIterator());
9071
9072 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9073 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9074
9075 // The pointer induction is performed using a GEP.
9076 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9077
9078 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9079 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9080 Value *NumUnrolledElems =
9081 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9082 Value *InductionGEP = GetElementPtrInst::Create(
9083 State.Builder.getInt8Ty(), NewPointerPhi,
9084 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9085 InductionLoc);
9086 // Add induction update using an incorrect block temporarily. The phi node
9087 // will be fixed after VPlan execution. Note that at this point the latch
9088 // block cannot be used, as it does not exist yet.
9089 // TODO: Model increment value in VPlan, by turning the recipe into a
9090 // multi-def and a subclass of VPHeaderPHIRecipe.
9091 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9092
9093 // Create UF many actual address geps that use the pointer
9094 // phi as base and a vectorized version of the step value
9095 // (<step*0, ..., step*N>) as offset.
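// For example, with a fixed VF of 4 and UF of 2, part 0 uses offsets
// <0,1,2,3> * Step and part 1 uses <4,5,6,7> * Step, while ptr.ind above
// advances the pointer phi by 8 * Step per vector iteration.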
9096 for (unsigned Part = 0; Part < State.UF; ++Part) {
9097 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9098 Value *StartOffsetScalar =
9099 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9100 Value *StartOffset =
9101 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9102 // Create a vector of consecutive numbers from zero to VF.
9103 StartOffset = State.Builder.CreateAdd(
9104 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9105
9106 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9107 "scalar step must be the same across all parts");
9108 Value *GEP = State.Builder.CreateGEP(
9109 State.Builder.getInt8Ty(), NewPointerPhi,
9110 State.Builder.CreateMul(
9111 StartOffset,
9112 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9113 "vector.gep"));
9114 State.set(this, GEP, Part);
9115 }
9116 }
9117
9118 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9119 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9120
9121 // Fast-math-flags propagate from the original induction instruction.
9122 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9123 if (FPBinOp)
9124 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9125
9126 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9127 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
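// emitTransformedIndex computes Start op (CanonicalIV * Step), where the
// exact operation depends on the induction kind: an integer add, a GEP for
// pointer inductions, or an FP binary operator for FP inductions.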
9128 Value *DerivedIV = emitTransformedIndex(
9129 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9130 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9131 DerivedIV->setName("offset.idx");
9132 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9133
9134 State.set(this, DerivedIV, VPIteration(0, 0));
9135 }
9136
9137 void VPReplicateRecipe::execute(VPTransformState &State) {
9138 Instruction *UI = getUnderlyingInstr();
9139 if (State.Instance) { // Generate a single instance.
9140 assert((State.VF.isScalar() || !isUniform()) &&
9141 "uniform recipe shouldn't be predicated");
9142 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9143 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9144 // Insert the scalar instance, packing it into a vector.
9145 if (State.VF.isVector() && shouldPack()) {
9146 // If we're constructing lane 0, initialize to start from poison.
9147 if (State.Instance->Lane.isFirstLane()) {
9148 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9149 Value *Poison = PoisonValue::get(
9150 VectorType::get(UI->getType(), State.VF));
9151 State.set(this, Poison, State.Instance->Part);
9152 }
9153 State.packScalarIntoVectorValue(this, *State.Instance);
9154 }
9155 return;
9156 }
9157
9158 if (IsUniform) {
9159 // If the recipe is uniform across all parts (instead of just per VF), only
9160 // generate a single instance.
9161 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9162 all_of(operands(), [](VPValue *Op) {
9163 return Op->isDefinedOutsideVectorRegions();
9164 })) {
9165 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9166 if (user_begin() != user_end()) {
9167 for (unsigned Part = 1; Part < State.UF; ++Part)
9168 State.set(this, State.get(this, VPIteration(0, 0)),
9169 VPIteration(Part, 0));
9170 }
9171 return;
9172 }
9173
9174 // Uniform within VL means we need to generate lane 0 only for each
9175 // unrolled copy.
9176 for (unsigned Part = 0; Part < State.UF; ++Part)
9177 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9178 return;
9179 }
9180
9181 // A store of a loop varying value to a uniform address only needs the last
9182 // copy of the store.
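// After the loop, only the value written by the final lane of the final
// part is observable, so a single scalar store of that lane suffices.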
9183 if (isa<StoreInst>(UI) &&
9184 vputils::isUniformAfterVectorization(getOperand(1))) {
9185 auto Lane = VPLane::getLastLaneForVF(State.VF);
9186 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9187 State);
9188 return;
9189 }
9190
9191 // Generate scalar instances for all VF lanes of all UF parts.
9192 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9193 const unsigned EndLane = State.VF.getKnownMinValue();
9194 for (unsigned Part = 0; Part < State.UF; ++Part)
9195 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9196 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9197 }
9198
9199 void VPWidenLoadRecipe::execute(VPTransformState &State) {
9200 auto *LI = cast<LoadInst>(&Ingredient);
9201
9202 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9203 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9204 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9205 bool CreateGather = !isConsecutive();
9206
9207 auto &Builder = State.Builder;
9208 State.setDebugLocFrom(getDebugLoc());
9209 for (unsigned Part = 0; Part < State.UF; ++Part) {
9210 Value *NewLI;
9211 Value *Mask = nullptr;
9212 if (auto *VPMask = getMask()) {
9213 // Mask reversal is only needed when an actual (non-null) mask is present;
9214 // a null mask denotes all-ones, and the reverse of all-ones is all-ones.
9215 Mask = State.get(VPMask, Part);
9216 if (isReverse())
9217 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9218 }
9219
9220 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
9221 if (CreateGather) {
9222 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
9223 "wide.masked.gather");
9224 } else if (Mask) {
9225 NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
9226 PoisonValue::get(DataTy),
9227 "wide.masked.load");
9228 } else {
9229 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
9230 }
9231 // Add metadata to the load, but set the reversed value as the vector value.
9232 State.addMetadata(NewLI, LI);
9233 if (Reverse)
9234 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9235 State.set(this, NewLI, Part);
9236 }
9237 }
9238
9239 /// Use an all-true mask for the reverse rather than the actual mask, as it
9240 /// avoids a dependence without affecting the result.
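/// For example, for a <vscale x 4 x i32> operand this emits a call such as
///   @llvm.experimental.vp.reverse.nxv4i32(%Operand, splat (i1 true), %EVL).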
9241 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
9242 Value *EVL, const Twine &Name) {
9243 VectorType *ValTy = cast<VectorType>(Operand->getType());
9244 Value *AllTrueMask =
9245 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
9246 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
9247 {Operand, AllTrueMask, EVL}, nullptr, Name);
9248 }
9249
9250 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
9251 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9252 "explicit vector length.");
9253 auto *LI = cast<LoadInst>(&Ingredient);
9254
9255 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9256 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9257 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9258 bool CreateGather = !isConsecutive();
9259
9260 auto &Builder = State.Builder;
9261 State.setDebugLocFrom(getDebugLoc());
9262 CallInst *NewLI;
9263 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9264 Value *Addr = State.get(getAddr(), 0, !CreateGather);
9265 Value *Mask = nullptr;
9266 if (VPValue *VPMask = getMask()) {
9267 Mask = State.get(VPMask, 0);
9268 if (isReverse())
9269 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9270 } else {
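// VP intrinsics always take an explicit mask, so synthesize an all-true
// mask when the recipe has none.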
9271 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9272 }
9273
9274 if (CreateGather) {
9275 NewLI =
9276 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9277 nullptr, "wide.masked.gather");
9278 } else {
9279 VectorBuilder VBuilder(Builder);
9280 VBuilder.setEVL(EVL).setMask(Mask);
9281 NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
9282 Instruction::Load, DataTy, Addr, "vp.op.load"));
9283 }
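// The pointer is operand 0 of both vp.load and vp.gather, so attach the
// alignment attribute to that parameter.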
9284 NewLI->addParamAttr(
9285 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
9286 State.addMetadata(NewLI, LI);
9287 Instruction *Res = NewLI;
9288 if (isReverse())
9289 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
9290 State.set(this, Res, 0);
9291 }
9292
9293 void VPWidenStoreRecipe::execute(VPTransformState &State) {
9294 auto *SI = cast<StoreInst>(&Ingredient);
9295
9296 VPValue *StoredVPValue = getStoredValue();
9297 bool CreateScatter = !isConsecutive();
9298 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9299
9300 auto &Builder = State.Builder;
9301 State.setDebugLocFrom(getDebugLoc());
9302
9303 for (unsigned Part = 0; Part < State.UF; ++Part) {
9304 Instruction *NewSI = nullptr;
9305 Value *Mask = nullptr;
9306 if (auto *VPMask = getMask()) {
9307 // Mask reversal is only needed when an actual (non-null) mask is present;
9308 // a null mask denotes all-ones, and the reverse of all-ones is all-ones.
9309 Mask = State.get(VPMask, Part);
9310 if (isReverse())
9311 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9312 }
9313
9314 Value *StoredVal = State.get(StoredVPValue, Part);
9315 if (isReverse()) {
9316 // If we store to reverse consecutive memory locations, then we need
9317 // to reverse the order of elements in the stored value.
9318 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9319 // We don't want to update the value in the map as it might be used in
9320 // another expression. So don't call resetVectorValue(StoredVal).
9321 }
9322 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
9323 if (CreateScatter)
9324 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
9325 else if (Mask)
9326 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
9327 else
9328 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
9329 State.addMetadata(NewSI, SI);
9330 }
9331 }
9332
9333 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
9334 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9335 "explicit vector length.");
9336 auto *SI = cast<StoreInst>(&Ingredient);
9337
9338 VPValue *StoredValue = getStoredValue();
9339 bool CreateScatter = !isConsecutive();
9340 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9341
9342 auto &Builder = State.Builder;
9343 State.setDebugLocFrom(getDebugLoc());
9344
9345 CallInst *NewSI = nullptr;
9346 Value *StoredVal = State.get(StoredValue, 0);
9347 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9348 if (isReverse())
9349 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
9350 Value *Mask = nullptr;
9351 if (VPValue *VPMask = getMask()) {
9352 Mask = State.get(VPMask, 0);
9353 if (isReverse())
9354 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9355 } else {
9356 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9357 }
9358 Value *Addr = State.get(getAddr(), 0, !CreateScatter);
9359 if (CreateScatter) {
9360 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9361 Intrinsic::vp_scatter,
9362 {StoredVal, Addr, Mask, EVL});
9363 } else {
9364 VectorBuilder VBuilder(Builder);
9365 VBuilder.setEVL(EVL).setMask(Mask);
9366 NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
9367 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9368 {StoredVal, Addr}));
9369 }
9370 NewSI->addParamAttr(
9371 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
9372 State.addMetadata(NewSI, SI);
9373 }
9374
9375 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9376 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9377 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9378 // for predication.
9379 static ScalarEpilogueLowering getScalarEpilogueLowering(
9380 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9381 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9382 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9383 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9384 // don't look at hints or options, and don't request a scalar epilogue.
9385 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9386 // LoopAccessInfo (due to code dependency and not being able to reliably get
9387 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9388 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9389 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9390 // back to the old way and vectorize with versioning when forced. See D81345.)
9391 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9392 PGSOQueryType::IRPass) &&
9393 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9394 return CM_ScalarEpilogueNotAllowedOptSize;
9395
9396 // 2) If set, obey the directives
9397 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9398 switch (PreferPredicateOverEpilogue) {
9399 case PreferPredicateTy::ScalarEpilogue:
9400 return CM_ScalarEpilogueAllowed;
9401 case PreferPredicateTy::PredicateElseScalarEpilogue:
9402 return CM_ScalarEpilogueNotNeededUsePredicate;
9403 case PreferPredicateTy::PredicateOrDontVectorize:
9404 return CM_ScalarEpilogueNotAllowedUsePredicate;
9405 }
9406 }
9407
9408 // 3) If set, obey the hints
9409 switch (Hints.getPredicate()) {
9410 case LoopVectorizeHints::FK_Enabled:
9411 return CM_ScalarEpilogueNotNeededUsePredicate;
9412 case LoopVectorizeHints::FK_Disabled:
9413 return CM_ScalarEpilogueAllowed;
9414 }
9415
9416 // 4) if the TTI hook indicates this is profitable, request predication.
9417 TailFoldingInfo TFI(TLI, &LVL, IAI);
9418 if (TTI->preferPredicateOverEpilogue(&TFI))
9419 return CM_ScalarEpilogueNotNeededUsePredicate;
9420
9421 return CM_ScalarEpilogueAllowed;
9422 }
9423
9424 // Process the loop in the VPlan-native vectorization path. This path builds
9425 // VPlan upfront in the vectorization pipeline, which allows applying
9426 // VPlan-to-VPlan transformations from the very beginning without modifying the
9427 // input LLVM IR.
9428 static bool processLoopInVPlanNativePath(
9429 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9430 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9431 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9432 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9433 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9434 LoopVectorizationRequirements &Requirements) {
9435
9436 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9437 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9438 return false;
9439 }
9440 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9441 Function *F = L->getHeader()->getParent();
9442 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9443
9444 ScalarEpilogueLowering SEL =
9445 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9446
9447 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9448 &Hints, IAI);
9449 // Use the planner for outer loop vectorization.
9450 // TODO: CM is not used at this point inside the planner. Turn CM into an
9451 // optional argument if we don't need it in the future.
9452 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9453 ORE);
9454
9455 // Get user vectorization factor.
9456 ElementCount UserVF = Hints.getWidth();
9457
9458 CM.collectElementTypesForWidening();
9459
9460 // Plan how to best vectorize, return the best VF and its cost.
9461 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9462
9463 // If we are stress testing VPlan builds, do not attempt to generate vector
9464 // code. Masked vector code generation support will follow soon.
9465 // Also, do not attempt to vectorize if no vector code will be produced.
9466 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9467 return false;
9468
9469 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9470
9471 {
9472 bool AddBranchWeights =
9473 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9474 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9475 F->getDataLayout(), AddBranchWeights);
9476 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9477 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9478 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9479 << L->getHeader()->getParent()->getName() << "\"\n");
9480 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9481 }
9482
9483 reportVectorization(ORE, L, VF, 1);
9484
9485 // Mark the loop as already vectorized to avoid vectorizing again.
9486 Hints.setAlreadyVectorized();
9487 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9488 return true;
9489 }
9490
9491 // Emit a remark if there are stores to floats that required a floating point
9492 // extension. If the vectorized loop was generated with floating point there
9493 // will be a performance penalty from the conversion overhead and the change in
9494 // the vector width.
9495 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9496 SmallVector<Instruction *, 4> Worklist;
9497 for (BasicBlock *BB : L->getBlocks()) {
9498 for (Instruction &Inst : *BB) {
9499 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9500 if (S->getValueOperand()->getType()->isFloatTy())
9501 Worklist.push_back(S);
9502 }
9503 }
9504 }
9505
9506 // Traverse the floating point stores upwards, searching for floating point
9507 // conversions.
9508 SmallPtrSet<const Instruction *, 4> Visited;
9509 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9510 while (!Worklist.empty()) {
9511 auto *I = Worklist.pop_back_val();
9512 if (!L->contains(I))
9513 continue;
9514 if (!Visited.insert(I).second)
9515 continue;
9516
9517 // Emit a remark if the floating point store required a floating
9518 // point conversion.
9519 // TODO: More work could be done to identify the root cause such as a
9520 // constant or a function return type and point the user to it.
9521 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9522 ORE->emit([&]() {
9523 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9524 I->getDebugLoc(), L->getHeader())
9525 << "floating point conversion changes vector width. "
9526 << "Mixed floating point precision requires an up/down "
9527 << "cast that will negatively impact performance.";
9528 });
9529
9530 for (Use &Op : I->operands())
9531 if (auto *OpI = dyn_cast<Instruction>(Op))
9532 Worklist.push_back(OpI);
9533 }
9534 }
9535
9536 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9537 VectorizationFactor &VF,
9538 std::optional<unsigned> VScale, Loop *L,
9539 ScalarEvolution &SE,
9540 ScalarEpilogueLowering SEL) {
9541 InstructionCost CheckCost = Checks.getCost();
9542 if (!CheckCost.isValid())
9543 return false;
9544
9545 // When only interleaving, the scalar and vector costs are equal, which in
9546 // turn would lead to a division by 0. Fall back to the hard threshold.
9547 if (VF.Width.isScalar()) {
9548 if (CheckCost > VectorizeMemoryCheckThreshold) {
9549 LLVM_DEBUG(
9550 dbgs()
9551 << "LV: Interleaving only is not profitable due to runtime checks\n");
9552 return false;
9553 }
9554 return true;
9555 }
9556
9557 // The scalar cost should only be 0 when vectorizing with a user specified
// VF/IC. In those cases, runtime checks should always be generated.
9558 uint64_t ScalarC = *VF.ScalarCost.getValue();
9559 if (ScalarC == 0)
9560 return true;
9561
9562 // First, compute the minimum iteration count required so that the vector
9563 // loop outperforms the scalar loop.
9564 // The total cost of the scalar loop is
9565 // ScalarC * TC
9566 // where
9567 // * TC is the actual trip count of the loop.
9568 // * ScalarC is the cost of a single scalar iteration.
9569 //
9570 // The total cost of the vector loop is
9571 // RtC + VecC * (TC / VF) + EpiC
9572 // where
9573 // * RtC is the cost of the generated runtime checks
9574 // * VecC is the cost of a single vector iteration.
9575 // * TC is the actual trip count of the loop
9576 // * VF is the vectorization factor
9577 // * EpiCost is the cost of the generated epilogue, including the cost
9578 // of the remaining scalar operations.
9579 //
9580 // Vectorization is profitable once the total vector cost is less than the
9581 // total scalar cost:
9582 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9583 //
9584 // Now we can compute the minimum required trip count TC as
9585 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9586 //
9587 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9588 // the divisions below are performed on integers and the result is rounded
9589 // up, hence we get an upper estimate of the TC.
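// For example, with ScalarC = 4, VecC = 10, RtC = 24 and VF = 4:
//   MinTC1 = ceil(24 * 4 / (4 * 4 - 10)) = ceil(96 / 6) = 16.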
9590 unsigned IntVF = VF.Width.getKnownMinValue();
9591 if (VF.Width.isScalable()) {
9592 unsigned AssumedMinimumVscale = 1;
9593 if (VScale)
9594 AssumedMinimumVscale = *VScale;
9595 IntVF *= AssumedMinimumVscale;
9596 }
9597 uint64_t RtC = *CheckCost.getValue();
9598 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9599 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9600
9601 // Second, compute a minimum iteration count so that the cost of the
9602 // runtime checks is only a fraction of the total scalar loop cost. This
9603 // adds a loop-dependent bound on the overhead incurred if the runtime
9604 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9605 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9606 // cost, compute
9607 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
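// With X = 10 (as used below) and, e.g., RtC = 24 and ScalarC = 4, this
// gives MinTC2 = ceil(240 / 4) = 60.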
9608 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
9609
9610 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9611 // epilogue is allowed, choose the next closest multiple of VF. This should
9612 // partly compensate for ignoring the epilogue cost.
9613 uint64_t MinTC = std::max(MinTC1, MinTC2);
9614 if (SEL == CM_ScalarEpilogueAllowed)
9615 MinTC = alignTo(MinTC, IntVF);
9616 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9617
9618 LLVM_DEBUG(
9619 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9620 << VF.MinProfitableTripCount << "\n");
9621
9622 // Skip vectorization if the expected trip count is less than the minimum
9623 // required trip count.
9624 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9625 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9626 VF.MinProfitableTripCount)) {
9627 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9628 "trip count < minimum profitable VF ("
9629 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9630 << ")\n");
9631
9632 return false;
9633 }
9634 }
9635 return true;
9636 }
9637
9638 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9639 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9640 !EnableLoopInterleaving),
9641 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9642 !EnableLoopVectorization) {}
9643
9644 bool LoopVectorizePass::processLoop(Loop *L) {
9645 assert((EnableVPlanNativePath || L->isInnermost()) &&
9646 "VPlan-native path is not enabled. Only process inner loops.");
9647
9648 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9649 << L->getHeader()->getParent()->getName() << "' from "
9650 << L->getLocStr() << "\n");
9651
9652 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9653
9654 LLVM_DEBUG(
9655 dbgs() << "LV: Loop hints:"
9656 << " force="
9657 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9658 ? "disabled"
9659 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9660 ? "enabled"
9661 : "?"))
9662 << " width=" << Hints.getWidth()
9663 << " interleave=" << Hints.getInterleave() << "\n");
9664
9665 // Function containing loop
9666 Function *F = L->getHeader()->getParent();
9667
9668 // Looking at the diagnostic output is the only way to determine if a loop
9669 // was vectorized (other than looking at the IR or machine code), so it
9670 // is important to generate an optimization remark for each loop. Most of
9671 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9672 // generated as OptimizationRemark and OptimizationRemarkMissed are
9673 // less verbose reporting vectorized loops and unvectorized loops that may
9674 // benefit from vectorization, respectively.
9675
9676 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9677 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9678 return false;
9679 }
9680
9681 PredicatedScalarEvolution PSE(*SE, *L);
9682
9683 // Check if it is legal to vectorize the loop.
9684 LoopVectorizationRequirements Requirements;
9685 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9686 &Requirements, &Hints, DB, AC, BFI, PSI);
9687 if (!LVL.canVectorize(EnableVPlanNativePath)) {
9688 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9689 Hints.emitRemarkWithHints();
9690 return false;
9691 }
9692
9693 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9694 // here. They may require CFG and instruction level transformations before
9695 // even evaluating whether vectorization is profitable. Since we cannot modify
9696 // the incoming IR, we need to build VPlan upfront in the vectorization
9697 // pipeline.
9698 if (!L->isInnermost())
9699 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9700 ORE, BFI, PSI, Hints, Requirements);
9701
9702 assert(L->isInnermost() && "Inner loop expected.");
9703
9704 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9705 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9706
9707 // If an override option has been passed in for interleaved accesses, use it.
9708 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9709 UseInterleaved = EnableInterleavedMemAccesses;
9710
9711 // Analyze interleaved memory accesses.
9712 if (UseInterleaved)
9713 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9714
9715 // Check the function attributes and profiles to find out if this function
9716 // should be optimized for size.
9717 ScalarEpilogueLowering SEL =
9718 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9719
9720 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9721 // count by optimizing for size, to minimize overheads.
9722 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9723 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9724 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9725 << "This loop is worth vectorizing only if no scalar "
9726 << "iteration overheads are incurred.");
9727 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9728 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9729 else {
9730 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9731 LLVM_DEBUG(dbgs() << "\n");
9732 // Predicate tail-folded loops are efficient even when the loop
9733 // iteration count is low. However, setting the epilogue policy to
9734 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9735 // with runtime checks. It's more effective to let
9736 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9737 // for the loop.
9738 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9739 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9740 } else {
9741 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9742 "small to consider vectorizing.\n");
9743 reportVectorizationFailure(
9744 "The trip count is below the minial threshold value.",
9745 "loop trip count is too low, avoiding vectorization",
9746 "LowTripCount", ORE, L);
9747 Hints.emitRemarkWithHints();
9748 return false;
9749 }
9750 }
9751 }
9752
9753 // Check the function attributes to see if implicit floats or vectors are
9754 // allowed.
9755 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9756 reportVectorizationFailure(
9757 "Can't vectorize when the NoImplicitFloat attribute is used",
9758 "loop not vectorized due to NoImplicitFloat attribute",
9759 "NoImplicitFloat", ORE, L);
9760 Hints.emitRemarkWithHints();
9761 return false;
9762 }
9763
9764 // Check if the target supports potentially unsafe FP vectorization.
9765 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9766 // for the target we're vectorizing for, to make sure none of the
9767 // additional fp-math flags can help.
9768 if (Hints.isPotentiallyUnsafe() &&
9769 TTI->isFPVectorizationPotentiallyUnsafe()) {
9770 reportVectorizationFailure(
9771 "Potentially unsafe FP op prevents vectorization",
9772 "loop not vectorized due to unsafe FP support.",
9773 "UnsafeFP", ORE, L);
9774 Hints.emitRemarkWithHints();
9775 return false;
9776 }
9777
9778 bool AllowOrderedReductions;
9779 // If the flag is set, use that instead and override the TTI behaviour.
9780 if (ForceOrderedReductions.getNumOccurrences() > 0)
9781 AllowOrderedReductions = ForceOrderedReductions;
9782 else
9783 AllowOrderedReductions = TTI->enableOrderedReductions();
9784 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9785 ORE->emit([&]() {
9786 auto *ExactFPMathInst = Requirements.getExactFPInst();
9787 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9788 ExactFPMathInst->getDebugLoc(),
9789 ExactFPMathInst->getParent())
9790 << "loop not vectorized: cannot prove it is safe to reorder "
9791 "floating-point operations";
9792 });
9793 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9794 "reorder floating-point operations\n");
9795 Hints.emitRemarkWithHints();
9796 return false;
9797 }
9798
9799 // Use the cost model.
9800 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9801 F, &Hints, IAI);
9802 // Use the planner for vectorization.
9803 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9804 ORE);
9805
9806 // Get user vectorization factor and interleave count.
9807 ElementCount UserVF = Hints.getWidth();
9808 unsigned UserIC = Hints.getInterleave();
9809
9810 // Plan how to best vectorize, return the best VF and its cost.
9811 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9812
9813 VectorizationFactor VF = VectorizationFactor::Disabled();
9814 unsigned IC = 1;
9815
9816 bool AddBranchWeights =
9817 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9818 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9819 F->getDataLayout(), AddBranchWeights);
9820 if (MaybeVF) {
9821 VF = *MaybeVF;
9822 // Select the interleave count.
9823 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9824
9825 unsigned SelectedIC = std::max(IC, UserIC);
9826 // Optimistically generate runtime checks if they are needed. Drop them if
9827 // they turn out to not be profitable.
9828 if (VF.Width.isVector() || SelectedIC > 1)
9829 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9830
9831 // Check if it is profitable to vectorize with runtime checks.
9832 bool ForceVectorization =
9833 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9834 if (!ForceVectorization &&
9835 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
9836 *PSE.getSE(), SEL)) {
9837 ORE->emit([&]() {
9838 return OptimizationRemarkAnalysisAliasing(
9839 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9840 L->getHeader())
9841 << "loop not vectorized: cannot prove it is safe to reorder "
9842 "memory operations";
9843 });
9844 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9845 Hints.emitRemarkWithHints();
9846 return false;
9847 }
9848 }
9849
9850 // Identify the diagnostic messages that should be produced.
9851 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9852 bool VectorizeLoop = true, InterleaveLoop = true;
9853 if (VF.Width.isScalar()) {
9854 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9855 VecDiagMsg = std::make_pair(
9856 "VectorizationNotBeneficial",
9857 "the cost-model indicates that vectorization is not beneficial");
9858 VectorizeLoop = false;
9859 }
9860
9861 if (!MaybeVF && UserIC > 1) {
9862 // Tell the user interleaving was avoided up-front, despite being explicitly
9863 // requested.
9864 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9865 "interleaving should be avoided up front\n");
9866 IntDiagMsg = std::make_pair(
9867 "InterleavingAvoided",
9868 "Ignoring UserIC, because interleaving was avoided up front");
9869 InterleaveLoop = false;
9870 } else if (IC == 1 && UserIC <= 1) {
9871 // Tell the user interleaving is not beneficial.
9872 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9873 IntDiagMsg = std::make_pair(
9874 "InterleavingNotBeneficial",
9875 "the cost-model indicates that interleaving is not beneficial");
9876 InterleaveLoop = false;
9877 if (UserIC == 1) {
9878 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9879 IntDiagMsg.second +=
9880 " and is explicitly disabled or interleave count is set to 1";
9881 }
9882 } else if (IC > 1 && UserIC == 1) {
9883 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9884 LLVM_DEBUG(
9885 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.\n");
9886 IntDiagMsg = std::make_pair(
9887 "InterleavingBeneficialButDisabled",
9888 "the cost-model indicates that interleaving is beneficial "
9889 "but is explicitly disabled or interleave count is set to 1");
9890 InterleaveLoop = false;
9891 }
9892
9893 // Override IC if user provided an interleave count.
9894 IC = UserIC > 0 ? UserIC : IC;
9895
9896 // Emit diagnostic messages, if any.
9897 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9898 if (!VectorizeLoop && !InterleaveLoop) {
9899 // Do not vectorize or interleave the loop.
9900 ORE->emit([&]() {
9901 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9902 L->getStartLoc(), L->getHeader())
9903 << VecDiagMsg.second;
9904 });
9905 ORE->emit([&]() {
9906 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9907 L->getStartLoc(), L->getHeader())
9908 << IntDiagMsg.second;
9909 });
9910 return false;
9911 } else if (!VectorizeLoop && InterleaveLoop) {
9912 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9913 ORE->emit([&]() {
9914 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9915 L->getStartLoc(), L->getHeader())
9916 << VecDiagMsg.second;
9917 });
9918 } else if (VectorizeLoop && !InterleaveLoop) {
9919 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9920 << ") in " << L->getLocStr() << '\n');
9921 ORE->emit([&]() {
9922 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9923 L->getStartLoc(), L->getHeader())
9924 << IntDiagMsg.second;
9925 });
9926 } else if (VectorizeLoop && InterleaveLoop) {
9927 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9928 << ") in " << L->getLocStr() << '\n');
9929 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9930 }
9931
9932 bool DisableRuntimeUnroll = false;
9933 MDNode *OrigLoopID = L->getLoopID();
9934 {
9935 using namespace ore;
9936 if (!VectorizeLoop) {
9937 assert(IC > 1 && "interleave count should not be 1 or 0");
9938 // If we decided that it is not worthwhile to vectorize the loop, then
9939 // interleave it.
9940 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
9941 &CM, BFI, PSI, Checks);
9942
9943 VPlan &BestPlan =
9944 UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width) : LVP.getBestPlan();
9945 assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) &&
9946 "VPlan cost model and legacy cost model disagreed");
9947 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
9948
9949 ORE->emit([&]() {
9950 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9951 L->getHeader())
9952 << "interleaved loop (interleaved count: "
9953 << NV("InterleaveCount", IC) << ")";
9954 });
9955 } else {
9956 // If we decided that it is worthwhile to vectorize the loop, then do it.
9957
9958 // Consider vectorizing the epilogue too if it's profitable.
9959 VectorizationFactor EpilogueVF =
9960 LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
9961 if (EpilogueVF.Width.isVector()) {
9962
9963 // The first pass vectorizes the main loop and creates a scalar epilogue
9964 // to be vectorized by executing the plan (potentially with a different
9965 // factor) again shortly afterwards.
9966 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
9967 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
9968 EPI, &LVL, &CM, BFI, PSI, Checks);
9969
9970 std::unique_ptr<VPlan> BestMainPlan(
9971 LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
9972 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
9973 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
9974 ++LoopsVectorized;
9975
9976 // Second pass vectorizes the epilogue and adjusts the control flow
9977 // edges from the first pass.
9978 EPI.MainLoopVF = EPI.EpilogueVF;
9979 EPI.MainLoopUF = EPI.EpilogueUF;
9980 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9981 ORE, EPI, &LVL, &CM, BFI, PSI,
9982 Checks);
9983
9984 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
9985 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
9986 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9987 Header->setName("vec.epilog.vector.body");
9988
9989 // Re-use the trip count and steps expanded for the main loop, as
9990 // skeleton creation needs it as a value that dominates both the scalar
9991 // and vector epilogue loops
9992 // TODO: This is a workaround needed for epilogue vectorization and it
9993 // should be removed once induction resume value creation is done
9994 // directly in VPlan.
9995 EpilogILV.setTripCount(MainILV.getTripCount());
9996 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
9997 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
9998 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
9999 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10000 ExpandR->replaceAllUsesWith(ExpandedVal);
10001 if (BestEpiPlan.getTripCount() == ExpandR)
10002 BestEpiPlan.resetTripCount(ExpandedVal);
10003 ExpandR->eraseFromParent();
10004 }
10005
10006 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10007 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10008 // before vectorizing the epilogue loop.
10009 for (VPRecipeBase &R : Header->phis()) {
10010 if (isa<VPCanonicalIVPHIRecipe>(&R))
10011 continue;
10012
10013 Value *ResumeV = nullptr;
10014 // TODO: Move setting of resume values to prepareToExecute.
10015 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10016 const RecurrenceDescriptor &RdxDesc =
10017 ReductionPhi->getRecurrenceDescriptor();
10018 RecurKind RK = RdxDesc.getRecurrenceKind();
10019 ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10020 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10021 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10022 // start value; compare the final value from the main vector loop
10023 // to the start value.
10024 IRBuilder<> Builder(
10025 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10026 ResumeV = Builder.CreateICmpNE(ResumeV,
10027 RdxDesc.getRecurrenceStartValue());
10028 }
10029 } else {
10030 // Create induction resume values for both widened pointer and
10031 // integer/fp inductions and update the start value of the induction
10032 // recipes to use the resume value.
10033 PHINode *IndPhi = nullptr;
10034 const InductionDescriptor *ID;
10035 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10036 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10037 ID = &Ind->getInductionDescriptor();
10038 } else {
10039 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10040 IndPhi = WidenInd->getPHINode();
10041 ID = &WidenInd->getInductionDescriptor();
10042 }
10043
10044 ResumeV = MainILV.createInductionResumeValue(
10045 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10046 {EPI.MainLoopIterationCountCheck});
10047 }
10048 assert(ResumeV && "Must have a resume value");
10049 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10050 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10051 }
10052
10053 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10054 "DT not preserved correctly");
10055 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10056 DT, true, &ExpandedSCEVs);
10057 ++LoopsEpilogueVectorized;
10058
10059 if (!MainILV.areSafetyChecksAdded())
10060 DisableRuntimeUnroll = true;
10061 } else {
10062 ElementCount Width = VF.Width;
10063 VPlan &BestPlan =
10064 UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan();
10065 if (!UseLegacyCostModel) {
10066 assert(size(BestPlan.vectorFactors()) == 1 &&
10067 "Plan should have a single VF");
10068 Width = *BestPlan.vectorFactors().begin();
10069 LLVM_DEBUG(dbgs()
10070 << "VF picked by VPlan cost model: " << Width << "\n");
10071 assert(VF.Width == Width &&
10072 "VPlan cost model and legacy cost model disagreed");
10073 }
10074 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
10075 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10076 PSI, Checks);
10077 LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
10078 ++LoopsVectorized;
10079
10080 // Add metadata to disable runtime unrolling of the scalar loop when there
10081 // are no runtime checks about strides and memory. A scalar loop that is
10082 // rarely used is not worth unrolling.
10083 if (!LB.areSafetyChecksAdded())
10084 DisableRuntimeUnroll = true;
10085 }
10086 // Report the vectorization decision.
10087 reportVectorization(ORE, L, VF, IC);
10088 }
10089
10090 if (ORE->allowExtraAnalysis(LV_NAME))
10091 checkMixedPrecision(L, ORE);
10092 }
10093
10094 std::optional<MDNode *> RemainderLoopID =
10095 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10096 LLVMLoopVectorizeFollowupEpilogue});
10097 if (RemainderLoopID) {
10098 L->setLoopID(*RemainderLoopID);
10099 } else {
10100 if (DisableRuntimeUnroll)
10101 AddRuntimeUnrollDisableMetaData(L);
10102
10103 // Mark the loop as already vectorized to avoid vectorizing again.
10104 Hints.setAlreadyVectorized();
10105 }
10106
10107 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10108 return true;
10109 }
10110
10111 LoopVectorizeResult LoopVectorizePass::runImpl(
10112 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10113 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10114 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10115 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10116 SE = &SE_;
10117 LI = &LI_;
10118 TTI = &TTI_;
10119 DT = &DT_;
10120 BFI = BFI_;
10121 TLI = TLI_;
10122 AC = &AC_;
10123 LAIs = &LAIs_;
10124 DB = &DB_;
10125 ORE = &ORE_;
10126 PSI = PSI_;
10127
10128 // Don't attempt if
10129 // 1. the target claims to have no vector registers, and
10130 // 2. interleaving won't help ILP.
10131 //
10132 // The second condition is necessary because, even if the target has no
10133 // vector registers, loop vectorization may still enable scalar
10134 // interleaving.
10135 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10136 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10137 return LoopVectorizeResult(false, false);
10138
10139 bool Changed = false, CFGChanged = false;
10140
10141 // The vectorizer requires loops to be in simplified form.
10142 // Since simplification may add new inner loops, it has to run before the
10143 // legality and profitability checks. This means running the loop vectorizer
10144 // will simplify all loops, regardless of whether anything ends up being
10145 // vectorized.
10146 for (const auto &L : *LI)
10147 Changed |= CFGChanged |=
10148 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10149
10150 // Build up a worklist of inner-loops to vectorize. This is necessary as
10151 // the act of vectorizing or partially unrolling a loop creates new loops
10152 // and can invalidate iterators across the loops.
10153 SmallVector<Loop *, 8> Worklist;
10154
10155 for (Loop *L : *LI)
10156 collectSupportedLoops(*L, LI, ORE, Worklist);
10157
10158 LoopsAnalyzed += Worklist.size();
10159
10160 // Now walk the identified inner loops.
10161 while (!Worklist.empty()) {
10162 Loop *L = Worklist.pop_back_val();
10163
10164 // For the inner loops we actually process, form LCSSA to simplify the
10165 // transform.
10166 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10167
10168 Changed |= CFGChanged |= processLoop(L);
10169
10170 if (Changed) {
10171 LAIs->clear();
10172
10173 #ifndef NDEBUG
10174 if (VerifySCEV)
10175 SE->verify();
10176 #endif
10177 }
10178 }
10179
10180 // Process each loop nest in the function.
10181 return LoopVectorizeResult(Changed, CFGChanged);
10182 }
10183
10184 PreservedAnalyses LoopVectorizePass::run(Function &F,
10185 FunctionAnalysisManager &AM) {
10186 auto &LI = AM.getResult<LoopAnalysis>(F);
10187 // There are no loops in the function. Return before computing other expensive
10188 // analyses.
10189 if (LI.empty())
10190 return PreservedAnalyses::all();
10191 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10192 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10193 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10194 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10195 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10196 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10197 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10198
10199 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10200 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10201 ProfileSummaryInfo *PSI =
10202 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10203 BlockFrequencyInfo *BFI = nullptr;
10204 if (PSI && PSI->hasProfileSummary())
10205 BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10206 LoopVectorizeResult Result =
10207 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10208 if (!Result.MadeAnyChange)
10209 return PreservedAnalyses::all();
10210 PreservedAnalyses PA;
10211
10212 if (isAssignmentTrackingEnabled(*F.getParent())) {
10213 for (auto &BB : F)
10214 RemoveRedundantDbgInstrs(&BB);
10215 }
10216
10217 PA.preserve<LoopAnalysis>();
10218 PA.preserve<DominatorTreeAnalysis>();
10219 PA.preserve<ScalarEvolutionAnalysis>();
10220 PA.preserve<LoopAccessAnalysis>();
10221
10222 if (Result.MadeCFGChange) {
10223 // Making CFG changes likely means a loop got vectorized. Indicate that
10224 // extra simplification passes should be run.
10225 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10226 // be run if runtime checks have been added.
10227 AM.getResult<ShouldRunExtraVectorPasses>(F);
10228 PA.preserve<ShouldRunExtraVectorPasses>();
10229 } else {
10230 PA.preserveSet<CFGAnalyses>();
10231 }
10232 return PA;
10233 }
10234
10235 void LoopVectorizePass::printPipeline(
10236 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10237 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10238 OS, MapClassName2PassName);
10239
10240 OS << '<';
10241 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10242 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10243 OS << '>';
10244 }
10245