xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass merges loads/stores to/from sequential memory addresses into vector
10 // loads/stores.  Although there's nothing GPU-specific in here, this pass is
11 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12 //
13 // (For simplicity below we talk about loads only, but everything also applies
14 // to stores.)
15 //
16 // This pass is intended to be run late in the pipeline, after other
17 // vectorization opportunities have been exploited.  So the assumption here is
18 // that immediately following our new vector load we'll need to extract out the
19 // individual elements of the load, so we can operate on them individually.
20 //
21 // On CPUs this transformation is usually not beneficial, because extracting the
22 // elements of a vector register is expensive on most architectures.  It's
23 // usually better just to load each element individually into its own scalar
24 // register.
25 //
26 // However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
27 // "vector load" loads directly into a series of scalar registers.  In effect,
28 // extracting the elements of the vector is free.  It's therefore always
29 // beneficial to vectorize a sequence of loads on these architectures.
30 //
31 // Vectorizing (perhaps a better name might be "coalescing") loads can have
32 // large performance impacts on GPU kernels, and opportunities for vectorizing
33 // are common in GPU code.  This pass tries very hard to find such
34 // opportunities; its runtime is quadratic in the number of loads in a BB.
35 //
36 // Some CPU architectures, such as ARM, have instructions that load into
37 // multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
38 // could use this pass (with some modifications), but currently it implements
39 // its own pass to do something similar to what we do here.
40 //
41 // Overview of the algorithm and terminology in this pass:
42 //
43 //  - Break up each basic block into pseudo-BBs, composed of instructions which
44 //    are guaranteed to transfer control to their successors.
45 //  - Within a single pseudo-BB, find all loads, and group them into
46 //    "equivalence classes" according to getUnderlyingObject() and loaded
47 //    element size.  Do the same for stores.
48 //  - For each equivalence class, greedily build "chains".  Each chain has a
49 //    leader instruction, and every other member of the chain has a known
50 //    constant offset from the first instr in the chain.
51 //  - Break up chains so that they contain only contiguous accesses of legal
52 //    size with no intervening may-alias instrs.
53 //  - Convert each chain to vector instructions.
54 //
55 // The O(n^2) behavior of this pass comes from initially building the chains.
56 // In the worst case we have to compare each new instruction to all of those
57 // that came before. To limit this, we only calculate the offset to the leaders
58 // of the N most recently-used chains.
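//
// As a small, purely illustrative example (names made up), IR along the
// lines of
//
//   %p0 = getelementptr i32, ptr %base, i64 0
//   %p1 = getelementptr i32, ptr %base, i64 1
//   %a  = load i32, ptr %p0
//   %b  = load i32, ptr %p1
//
// puts both loads into one equivalence class (same underlying object %base,
// same address space, same 32-bit element size, both loads), builds a single
// chain with byte offsets 0 and 4 from the leader %a, and, assuming the
// target accepts the resulting 64-bit access, rewrites it roughly as
//
//   %v = load <2 x i32>, ptr %p0
//   %a = extractelement <2 x i32> %v, i32 0
//   %b = extractelement <2 x i32> %v, i32 1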
59 
60 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
61 #include "llvm/ADT/APInt.h"
62 #include "llvm/ADT/ArrayRef.h"
63 #include "llvm/ADT/DenseMap.h"
64 #include "llvm/ADT/MapVector.h"
65 #include "llvm/ADT/PostOrderIterator.h"
66 #include "llvm/ADT/STLExtras.h"
67 #include "llvm/ADT/Sequence.h"
68 #include "llvm/ADT/SmallPtrSet.h"
69 #include "llvm/ADT/SmallVector.h"
70 #include "llvm/ADT/Statistic.h"
71 #include "llvm/ADT/iterator_range.h"
72 #include "llvm/Analysis/AliasAnalysis.h"
73 #include "llvm/Analysis/AssumptionCache.h"
74 #include "llvm/Analysis/MemoryLocation.h"
75 #include "llvm/Analysis/ScalarEvolution.h"
76 #include "llvm/Analysis/TargetTransformInfo.h"
77 #include "llvm/Analysis/ValueTracking.h"
78 #include "llvm/Analysis/VectorUtils.h"
79 #include "llvm/IR/Attributes.h"
80 #include "llvm/IR/BasicBlock.h"
81 #include "llvm/IR/ConstantRange.h"
82 #include "llvm/IR/Constants.h"
83 #include "llvm/IR/DataLayout.h"
84 #include "llvm/IR/DerivedTypes.h"
85 #include "llvm/IR/Dominators.h"
86 #include "llvm/IR/Function.h"
87 #include "llvm/IR/GetElementPtrTypeIterator.h"
88 #include "llvm/IR/IRBuilder.h"
89 #include "llvm/IR/InstrTypes.h"
90 #include "llvm/IR/Instruction.h"
91 #include "llvm/IR/Instructions.h"
92 #include "llvm/IR/LLVMContext.h"
93 #include "llvm/IR/Module.h"
94 #include "llvm/IR/Type.h"
95 #include "llvm/IR/Value.h"
96 #include "llvm/InitializePasses.h"
97 #include "llvm/Pass.h"
98 #include "llvm/Support/Alignment.h"
99 #include "llvm/Support/Casting.h"
100 #include "llvm/Support/Debug.h"
101 #include "llvm/Support/KnownBits.h"
102 #include "llvm/Support/MathExtras.h"
103 #include "llvm/Support/ModRef.h"
104 #include "llvm/Support/raw_ostream.h"
105 #include "llvm/Transforms/Utils/Local.h"
106 #include <algorithm>
107 #include <cassert>
108 #include <cstdint>
109 #include <cstdlib>
110 #include <iterator>
111 #include <numeric>
112 #include <optional>
113 #include <tuple>
114 #include <type_traits>
115 #include <utility>
116 #include <vector>
117 
118 using namespace llvm;
119 
120 #define DEBUG_TYPE "load-store-vectorizer"
121 
122 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124 
125 namespace {
126 
127 // Equivalence class key, the initial tuple by which we group loads/stores.
128 // Loads/stores with different EqClassKeys are never merged.
129 //
130 // (We could in theory remove element-size from this tuple.  We'd just need
131 // to fix up the vector packing/unpacking code.)
132 using EqClassKey =
133     std::tuple<const Value * /* result of getUnderlyingObject() */,
134                unsigned /* AddrSpace */,
135                unsigned /* Load/Store element size bits */,
136                char /* IsLoad; char b/c bool can't be a DenseMap key */
137                >;
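//
// For example (illustrative): an i8 load and a <4 x i8> load of the same
// underlying object in the same address space share an EqClassKey (both have
// an 8-bit element size), whereas an i32 load of that object does not.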
138 [[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139                                                const EqClassKey &K) {
140   const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141   return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142             << " of element size " << ElementSize << " bits in addrspace "
143             << AddrSpace;
144 }
145 
146 // A Chain is a set of instructions such that:
147 //  - All instructions have the same equivalence class, so in particular all are
148 //    loads, or all are stores.
149 //  - We know the address accessed by the i'th chain elem relative to the
150 //    chain's leader instruction, which is the first instr of the chain in BB
151 //    order.
152 //
153 // Chains have two canonical orderings:
154 //  - BB order, sorted by Instr->comesBefore.
155 //  - Offset order, sorted by OffsetFromLeader.
156 // This pass switches back and forth between these orders.
157 struct ChainElem {
158   Instruction *Inst;
159   APInt OffsetFromLeader;
160 };
161 using Chain = SmallVector<ChainElem, 1>;
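// For example (illustrative): i32 loads of a[0], a[2], a[1], appearing in
// that order in the BB, form one chain whose leader is the load of a[0];
// their OffsetFromLeader values are 0, 8, and 4 bytes respectively.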
162 
163 void sortChainInBBOrder(Chain &C) {
164   sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
165 }
166 
167 void sortChainInOffsetOrder(Chain &C) {
168   sort(C, [](const auto &A, const auto &B) {
169     if (A.OffsetFromLeader != B.OffsetFromLeader)
170       return A.OffsetFromLeader.slt(B.OffsetFromLeader);
171     return A.Inst->comesBefore(B.Inst); // stable tiebreaker
172   });
173 }
174 
175 [[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
176   for (const auto &E : C) {
177     dbgs() << "  " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
178   }
179 }
180 
181 using EquivalenceClassMap =
182     MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
183 
184 // FIXME: Assuming stack alignment of 4 is always good enough
185 constexpr unsigned StackAdjustedAlignment = 4;
186 
187 Instruction *propagateMetadata(Instruction *I, const Chain &C) {
188   SmallVector<Value *, 8> Values;
189   for (const ChainElem &E : C)
190     Values.push_back(E.Inst);
191   return propagateMetadata(I, Values);
192 }
193 
194 bool isInvariantLoad(const Instruction *I) {
195   const LoadInst *LI = dyn_cast<LoadInst>(I);
196   return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
197 }
198 
199 /// Reorders the instructions that I depends on (the instructions defining its
200 /// operands), to ensure they dominate I.
201 void reorder(Instruction *I) {
202   SmallPtrSet<Instruction *, 16> InstructionsToMove;
203   SmallVector<Instruction *, 16> Worklist;
204 
205   Worklist.push_back(I);
206   while (!Worklist.empty()) {
207     Instruction *IW = Worklist.pop_back_val();
208     int NumOperands = IW->getNumOperands();
209     for (int i = 0; i < NumOperands; i++) {
210       Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
211       if (!IM || IM->getOpcode() == Instruction::PHI)
212         continue;
213 
214       // If IM is in another BB, no need to move it, because this pass only
215       // vectorizes instructions within one BB.
216       if (IM->getParent() != I->getParent())
217         continue;
218 
219       if (!IM->comesBefore(I)) {
220         InstructionsToMove.insert(IM);
221         Worklist.push_back(IM);
222       }
223     }
224   }
225 
226   // All instructions to move should follow I. Start from I, not from begin().
227   for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
228     Instruction *IM = &*(BBI++);
229     if (!InstructionsToMove.count(IM))
230       continue;
231     IM->moveBefore(I);
232   }
233 }
234 
235 class Vectorizer {
236   Function &F;
237   AliasAnalysis &AA;
238   AssumptionCache &AC;
239   DominatorTree &DT;
240   ScalarEvolution &SE;
241   TargetTransformInfo &TTI;
242   const DataLayout &DL;
243   IRBuilder<> Builder;
244 
245   // We could erase instrs right after vectorizing them, but that can mess up
246   // our BB iterators, and also can make the equivalence class keys point to
247   // freed memory.  This is fixable, but it's simpler just to wait until we're
248   // done with the BB and erase all at once.
249   SmallVector<Instruction *, 128> ToErase;
250 
251 public:
252   Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
253              DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
254       : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
255         DL(F.getDataLayout()), Builder(SE.getContext()) {}
256 
257   bool run();
258 
259 private:
260   static const unsigned MaxDepth = 3;
261 
262   /// Runs the vectorizer on a "pseudo basic block", which is a range of
263   /// instructions [Begin, End) within one BB all of which have
264   /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
265   bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
266 
267   /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
268   /// in the same BB with the same value for getUnderlyingObject() etc.
269   bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
270                              ArrayRef<Instruction *> EqClass);
271 
272   /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
273   /// where all instructions access a known, constant offset from the first
274   /// instruction.
275   bool runOnChain(Chain &C);
276 
277   /// Splits the chain into subchains of instructions which read/write a
278   /// contiguous block of memory.  Discards any length-1 subchains (because
279   /// there's nothing to vectorize in there).
280   std::vector<Chain> splitChainByContiguity(Chain &C);
281 
282   /// Splits the chain into subchains where it's safe to hoist loads up to the
283   /// beginning of the sub-chain and it's safe to sink loads up to the end of
284   /// the sub-chain.  Discards any length-1 subchains.
285   std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
286 
287   /// Splits the chain into subchains that make legal, aligned accesses.
288   /// Discards any length-1 subchains.
289   std::vector<Chain> splitChainByAlignment(Chain &C);
290 
291   /// Converts the instrs in the chain into a single vectorized load or store.
292   /// Adds the old scalar loads/stores to ToErase.
293   bool vectorizeChain(Chain &C);
294 
295   /// Tries to compute the offset in bytes PtrB - PtrA.
296   std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
297                                          Instruction *ContextInst,
298                                          unsigned Depth = 0);
299   std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
300                                                      Instruction *ContextInst,
301                                                      unsigned Depth);
302   std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
303                                                 Instruction *ContextInst,
304                                                 unsigned Depth);
305 
306   /// Gets the element type of the vector that the chain will load or store.
307   /// This is nontrivial because the chain may contain elements of different
308   /// types; e.g. it's legal to have a chain that contains both i32 and float.
309   Type *getChainElemTy(const Chain &C);
310 
311   /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
312   /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
313   /// instructions.
314   ///
315   /// The map ChainElemOffsets must contain all of the elements in
316   /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
317   /// address.  It's ok if it contains additional entries.
318   template <bool IsLoadChain>
319   bool isSafeToMove(
320       Instruction *ChainElem, Instruction *ChainBegin,
321       const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
322 
323   /// Collects loads and stores grouped by "equivalence class", where:
324   ///   - all elements in an eq class are a load or all are a store,
325   ///   - they all load/store the same element size (it's OK to have e.g. i8 and
326   ///     <4 x i8> in the same class, but not i32 and <4 x i8>), and
327   ///   - they all have the same value for getUnderlyingObject().
328   EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
329                                                 BasicBlock::iterator End);
330 
331   /// Partitions Instrs into "chains" where every instruction has a known
332   /// constant offset from the first instr in the chain.
333   ///
334   /// Postcondition: For all i, ret[i][0].OffsetFromLeader == 0, because the first
335   /// instr in the chain is the leader, and an instr touches distance 0 from itself.
336   std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
337 };
338 
339 class LoadStoreVectorizerLegacyPass : public FunctionPass {
340 public:
341   static char ID;
342 
343   LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
344     initializeLoadStoreVectorizerLegacyPassPass(
345         *PassRegistry::getPassRegistry());
346   }
347 
348   bool runOnFunction(Function &F) override;
349 
350   StringRef getPassName() const override {
351     return "GPU Load and Store Vectorizer";
352   }
353 
354   void getAnalysisUsage(AnalysisUsage &AU) const override {
355     AU.addRequired<AAResultsWrapperPass>();
356     AU.addRequired<AssumptionCacheTracker>();
357     AU.addRequired<ScalarEvolutionWrapperPass>();
358     AU.addRequired<DominatorTreeWrapperPass>();
359     AU.addRequired<TargetTransformInfoWrapperPass>();
360     AU.setPreservesCFG();
361   }
362 };
363 
364 } // end anonymous namespace
365 
366 char LoadStoreVectorizerLegacyPass::ID = 0;
367 
368 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
369                       "Vectorize load and store instructions", false, false)
370 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
371 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
372 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
373 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
374 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
375 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
376 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
377                     "Vectorize load and store instructions", false, false)
378 
379 Pass *llvm::createLoadStoreVectorizerPass() {
380   return new LoadStoreVectorizerLegacyPass();
381 }
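
// Note: with the new pass manager this pass is normally exercised as
// "load-store-vectorizer", e.g. `opt -passes=load-store-vectorizer`, which is
// how the in-tree lit tests typically invoke it.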
382 
383 bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
384   // Don't vectorize when the attribute NoImplicitFloat is used.
385   if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
386     return false;
387 
388   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
389   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
390   ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
391   TargetTransformInfo &TTI =
392       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
393 
394   AssumptionCache &AC =
395       getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
396 
397   return Vectorizer(F, AA, AC, DT, SE, TTI).run();
398 }
399 
400 PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
401                                                FunctionAnalysisManager &AM) {
402   // Don't vectorize when the attribute NoImplicitFloat is used.
403   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
404     return PreservedAnalyses::all();
405 
406   AliasAnalysis &AA = AM.getResult<AAManager>(F);
407   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
408   ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
409   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
410   AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
411 
412   bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
413   PreservedAnalyses PA;
414   PA.preserveSet<CFGAnalyses>();
415   return Changed ? PA : PreservedAnalyses::all();
416 }
417 
418 bool Vectorizer::run() {
419   bool Changed = false;
420   // Break up the BB if there are any instrs which aren't guaranteed to transfer
421   // execution to their successor.
422   //
423   // Consider, for example:
424   //
425   //   def assert_arr_len(int n) { if (n < 2) exit(); }
426   //
427   //   load arr[0]
428   //   call assert_array_len(arr.length)
429   //   load arr[1]
430   //
431   // Even though assert_arr_len does not read or write any memory, we can't
432   // speculate the second load before the call.  More info at
433   // https://github.com/llvm/llvm-project/issues/52950.
434   for (BasicBlock *BB : post_order(&F)) {
435     // BB must at least have a terminator.
436     assert(!BB->empty());
437 
438     SmallVector<BasicBlock::iterator, 8> Barriers;
439     Barriers.push_back(BB->begin());
440     for (Instruction &I : *BB)
441       if (!isGuaranteedToTransferExecutionToSuccessor(&I))
442         Barriers.push_back(I.getIterator());
443     Barriers.push_back(BB->end());
444 
445     for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
446          ++It)
447       Changed |= runOnPseudoBB(*It, *std::next(It));
448 
449     for (Instruction *I : ToErase) {
450       auto *PtrOperand = getLoadStorePointerOperand(I);
451       if (I->use_empty())
452         I->eraseFromParent();
453       RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
454     }
455     ToErase.clear();
456   }
457 
458   return Changed;
459 }
460 
461 bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
462                                BasicBlock::iterator End) {
463   LLVM_DEBUG({
464     dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
465     if (End != Begin->getParent()->end())
466       dbgs() << *End;
467     else
468       dbgs() << "<BB end>";
469     dbgs() << ")\n";
470   });
471 
472   bool Changed = false;
473   for (const auto &[EqClassKey, EqClass] :
474        collectEquivalenceClasses(Begin, End))
475     Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
476 
477   return Changed;
478 }
479 
480 bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
481                                        ArrayRef<Instruction *> EqClass) {
482   bool Changed = false;
483 
484   LLVM_DEBUG({
485     dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
486            << " keyed on " << EqClassKey << ":\n";
487     for (Instruction *I : EqClass)
488       dbgs() << "  " << *I << "\n";
489   });
490 
491   std::vector<Chain> Chains = gatherChains(EqClass);
492   LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
493                     << " nontrivial chains.\n";);
494   for (Chain &C : Chains)
495     Changed |= runOnChain(C);
496   return Changed;
497 }
498 
499 bool Vectorizer::runOnChain(Chain &C) {
500   LLVM_DEBUG({
501     dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
502     dumpChain(C);
503   });
504 
505   // Split up the chain into increasingly smaller chains, until we can finally
506   // vectorize the chains.
507   //
508   // (Don't be scared by the depth of the loop nest here.  These operations are
509   // all at worst O(n lg n) in the number of instructions, and splitting chains
510   // doesn't change the number of instrs.  So the whole loop nest is O(n lg n).)
511   bool Changed = false;
512   for (auto &C : splitChainByMayAliasInstrs(C))
513     for (auto &C : splitChainByContiguity(C))
514       for (auto &C : splitChainByAlignment(C))
515         Changed |= vectorizeChain(C);
516   return Changed;
517 }
518 
519 std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
520   if (C.empty())
521     return {};
522 
523   sortChainInBBOrder(C);
524 
525   LLVM_DEBUG({
526     dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
527     dumpChain(C);
528   });
529 
530   // We know that elements in the chain with non-overlapping offsets can't
531   // alias, but AA may not be smart enough to figure this out.  Use a
532   // hashtable.
533   DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
534   for (const auto &E : C)
535     ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
536 
537   // Loads get hoisted up to the first load in the chain.  Stores get sunk
538   // down to the last store in the chain.  Our algorithm for loads is:
539   //
540   //  - Take the first element of the chain.  This is the start of a new chain.
541   //  - Take the next element of `Chain` and check for may-alias instructions
542   //    up to the start of NewChain.  If no may-alias instrs, add it to
543   //    NewChain.  Otherwise, start a new NewChain.
544   //
545   // For stores it's the same except in the reverse direction.
546   //
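  // As an illustrative sketch (names made up): given
  //
  //   %l0 = load i32, ptr %a0
  //   store i32 0, ptr %p      ; may alias %a0 / %a1
  //   %l1 = load i32, ptr %a1
  //
  // %l1 cannot be hoisted above the store, so %l0 and %l1 end up in separate
  // chains of length 1, which are then discarded.
  //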
547   // We expect IsLoad to be an std::bool_constant.
548   auto Impl = [&](auto IsLoad) {
549     // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
550     auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
551       if constexpr (IsLoad())
552         return std::make_pair(C.begin(), C.end());
553       else
554         return std::make_pair(C.rbegin(), C.rend());
555     }(IsLoad);
556     assert(ChainBegin != ChainEnd);
557 
558     std::vector<Chain> Chains;
559     SmallVector<ChainElem, 1> NewChain;
560     NewChain.push_back(*ChainBegin);
561     for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
562       if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
563                                ChainOffsets)) {
564         LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
565                           << *ChainIt->Inst << " into " << *ChainBegin->Inst
566                           << "\n");
567         NewChain.push_back(*ChainIt);
568       } else {
569         LLVM_DEBUG(
570             dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
571                    << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
572         if (NewChain.size() > 1) {
573           LLVM_DEBUG({
574             dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
575             dumpChain(NewChain);
576           });
577           Chains.push_back(std::move(NewChain));
578         }
579 
580         // Start a new chain.
581         NewChain = SmallVector<ChainElem, 1>({*ChainIt});
582       }
583     }
584     if (NewChain.size() > 1) {
585       LLVM_DEBUG({
586         dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
587         dumpChain(NewChain);
588       });
589       Chains.push_back(std::move(NewChain));
590     }
591     return Chains;
592   };
593 
594   if (isa<LoadInst>(C[0].Inst))
595     return Impl(/*IsLoad=*/std::bool_constant<true>());
596 
597   assert(isa<StoreInst>(C[0].Inst));
598   return Impl(/*IsLoad=*/std::bool_constant<false>());
599 }
600 
601 std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
602   if (C.empty())
603     return {};
604 
605   sortChainInOffsetOrder(C);
606 
607   LLVM_DEBUG({
608     dbgs() << "LSV: splitChainByContiguity considering chain:\n";
609     dumpChain(C);
610   });
611 
612   std::vector<Chain> Ret;
613   Ret.push_back({C.front()});
614 
615   for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
616     // `Prev` accesses offsets [Prev.OffsetFromLeader, PrevReadEnd).
617     auto &CurChain = Ret.back();
618     const ChainElem &Prev = CurChain.back();
619     unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
620     assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
621                               "collectEquivalenceClasses");
622     APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
623 
624     // Add this instruction to the end of the current chain, or start a new one.
625     bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
626     LLVM_DEBUG(dbgs() << "LSV: Instructions are "
627                       << (AreContiguous ? "" : "not ") << "contiguous: "
628                       << *Prev.Inst << " (ends at offset " << PrevReadEnd
629                       << ") -> " << *It->Inst << " (starts at offset "
630                       << It->OffsetFromLeader << ")\n");
631     if (AreContiguous)
632       CurChain.push_back(*It);
633     else
634       Ret.push_back({*It});
635   }
636 
637   // Filter out length-1 chains, these are uninteresting.
638   llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
639   return Ret;
640 }
641 
642 Type *Vectorizer::getChainElemTy(const Chain &C) {
643   assert(!C.empty());
644   // The rules are:
645   //  - If there are any pointer types in the chain, use an integer type.
646   //  - Prefer an integer type if it appears in the chain.
647   //  - Otherwise, use the first type in the chain.
648   //
649   // The rule about pointer types is a simplification when we merge e.g.  a load
650   // of a ptr and a double.  There's no direct conversion from a ptr to a
651   // double; it requires a ptrtoint followed by a bitcast.
652   //
653   // It's unclear to me if the other rules have any practical effect, but we do
654   // it to match this pass's previous behavior.
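  //
  // For example (illustrative, assuming 64-bit pointers): a chain of a float
  // load and an i32 load uses i32; a chain of a ptr load and a double load
  // uses i64.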
655   if (any_of(C, [](const ChainElem &E) {
656         return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
657       })) {
658     return Type::getIntNTy(
659         F.getContext(),
660         DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
661   }
662 
663   for (const ChainElem &E : C)
664     if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
665       return T;
666   return getLoadStoreType(C[0].Inst)->getScalarType();
667 }
668 
669 std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
670   // We use a simple greedy algorithm.
671   //  - Given a chain of length N, find all prefixes that
672   //    (a) are not longer than the max register length, and
673   //    (b) are a power of 2.
674   //  - Starting from the longest prefix, try to create a vector of that length.
675   //  - If one of them works, great.  Repeat the algorithm on any remaining
676   //    elements in the chain.
677   //  - If none of them work, discard the first element and repeat on a chain
678   //    of length N-1.
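  //
  // As an illustrative sketch: with 16-byte vector registers and a chain of
  // six contiguous i32 accesses, starting at element 0 the longest candidate
  // covers elements [0, 3] (16 bytes).  If that candidate is legal and fast
  // enough, it becomes one vector access and we continue from element 4 with
  // the remaining pair; otherwise we try the shorter candidates, and if none
  // work we retry starting from element 1.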
679   if (C.empty())
680     return {};
681 
682   sortChainInOffsetOrder(C);
683 
684   LLVM_DEBUG({
685     dbgs() << "LSV: splitChainByAlignment considering chain:\n";
686     dumpChain(C);
687   });
688 
689   bool IsLoadChain = isa<LoadInst>(C[0].Inst);
690   auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
691                              unsigned ChainSizeBytes, VectorType *VecTy) {
692     return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
693                                                  ChainSizeBytes, VecTy)
694                        : TTI.getStoreVectorFactor(VF, LoadStoreSize,
695                                                   ChainSizeBytes, VecTy);
696   };
697 
698 #ifndef NDEBUG
699   for (const auto &E : C) {
700     Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
701     assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
702            "Should have filtered out non-power-of-two elements in "
703            "collectEquivalenceClasses.");
704   }
705 #endif
706 
707   unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
708   unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
709 
710   std::vector<Chain> Ret;
711   for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
712     // Find candidate chains of size not greater than the largest vector reg.
713     // These chains are over the closed interval [CBegin, CEnd].
714     SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
715         CandidateChains;
716     for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
717       APInt Sz = C[CEnd].OffsetFromLeader +
718                  DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
719                  C[CBegin].OffsetFromLeader;
720       if (Sz.sgt(VecRegBytes))
721         break;
722       CandidateChains.push_back(
723           {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
724     }
725 
726     // Consider the longest chain first.
727     for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
728          It != End; ++It) {
729       auto [CEnd, SizeBytes] = *It;
730       LLVM_DEBUG(
731           dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
732                  << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
733 
734       Type *VecElemTy = getChainElemTy(C);
735       // Note, VecElemTy is a power of 2, but might be less than one byte.  For
736       // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
737       // VecElemTy would be i4.
738       unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
739 
740       // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
741       assert((8 * SizeBytes) % VecElemBits == 0);
742       unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
743       FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
744       unsigned VF = 8 * VecRegBytes / VecElemBits;
745 
746       // Check that TTI is happy with this vectorization factor.
747       unsigned TargetVF = getVectorFactor(VF, VecElemBits,
748                                           VecElemBits * NumVecElems / 8, VecTy);
749       if (TargetVF != VF && TargetVF < NumVecElems) {
750         LLVM_DEBUG(
751             dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
752                       "because TargetVF="
753                    << TargetVF << " != VF=" << VF
754                    << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
755         continue;
756       }
757 
758       // Is a load/store with this alignment allowed by TTI and at least as fast
759       // as an unvectorized load/store?
760       //
761       // TTI and F are passed as explicit captures to work around an MSVC misparse.
762       auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
763                                &F = F](Align Alignment) {
764         if (Alignment.value() % SizeBytes == 0)
765           return true;
766         unsigned VectorizedSpeed = 0;
767         bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
768             F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
769         if (!AllowsMisaligned) {
770           LLVM_DEBUG(dbgs()
771                      << "LSV: Access of " << SizeBytes << "B in addrspace "
772                      << AS << " with alignment " << Alignment.value()
773                      << " is misaligned, and therefore can't be vectorized.\n");
774           return false;
775         }
776 
777         unsigned ElementwiseSpeed = 0;
778         (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
779                                              Alignment, &ElementwiseSpeed);
780         if (VectorizedSpeed < ElementwiseSpeed) {
781           LLVM_DEBUG(dbgs()
782                      << "LSV: Access of " << SizeBytes << "B in addrspace "
783                      << AS << " with alignment " << Alignment.value()
784                      << " has relative speed " << VectorizedSpeed
785                      << ", which is lower than the elementwise speed of "
786                      << ElementwiseSpeed
787                      << ".  Therefore this access won't be vectorized.\n");
788           return false;
789         }
790         return true;
791       };
792 
793       // If we're loading/storing from an alloca, align it if possible.
794       //
795       // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
796       // tells us this is beneficial.  This feels a bit odd, but it matches
797       // existing tests.  This isn't *so* bad, because at most we align to 4
798       // bytes (current value of StackAdjustedAlignment).
799       //
800       // FIXME: We will upgrade the alignment of the alloca even if it turns out
801       // we can't vectorize for some other reason.
802       Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
803       bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
804                             isa<AllocaInst>(PtrOperand->stripPointerCasts());
805       Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
806       Align PrefAlign = Align(StackAdjustedAlignment);
807       if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
808           IsAllowedAndFast(PrefAlign)) {
809         Align NewAlign = getOrEnforceKnownAlignment(
810             PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
811         if (NewAlign >= Alignment) {
812           LLVM_DEBUG(dbgs()
813                      << "LSV: splitChainByAlignment upgrading alloca alignment from "
814                      << Alignment.value() << " to " << NewAlign.value()
815                      << "\n");
816           Alignment = NewAlign;
817         }
818       }
819 
820       if (!IsAllowedAndFast(Alignment)) {
821         LLVM_DEBUG(
822             dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
823                       "because its alignment is not AllowedAndFast: "
824                    << Alignment.value() << "\n");
825         continue;
826       }
827 
828       if ((IsLoadChain &&
829            !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
830           (!IsLoadChain &&
831            !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
832         LLVM_DEBUG(
833             dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
834                       "because !isLegalToVectorizeLoad/StoreChain.\n");
835         continue;
836       }
837 
838       // Hooray, we can vectorize this chain!
839       Chain &NewChain = Ret.emplace_back();
840       for (unsigned I = CBegin; I <= CEnd; ++I)
841         NewChain.push_back(C[I]);
842       CBegin = CEnd; // Skip over the instructions we've added to the chain.
843       break;
844     }
845   }
846   return Ret;
847 }
848 
849 bool Vectorizer::vectorizeChain(Chain &C) {
850   if (C.size() < 2)
851     return false;
852 
853   sortChainInOffsetOrder(C);
854 
855   LLVM_DEBUG({
856     dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
857     dumpChain(C);
858   });
859 
860   Type *VecElemTy = getChainElemTy(C);
861   bool IsLoadChain = isa<LoadInst>(C[0].Inst);
862   unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
863   unsigned ChainBytes = std::accumulate(
864       C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
865         return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
866       });
867   assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
868   // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
869   // than 1 byte (e.g. VecTy == <32 x i1>).
870   Type *VecTy = FixedVectorType::get(
871       VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
872 
873   Align Alignment = getLoadStoreAlignment(C[0].Inst);
874   // If this is a load/store of an alloca, we might have upgraded the alloca's
875   // alignment earlier.  Get the new alignment.
876   if (AS == DL.getAllocaAddrSpace()) {
877     Alignment = std::max(
878         Alignment,
879         getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
880                                    MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
881   }
882 
883   // All elements of the chain must have the same scalar-type size.
884 #ifndef NDEBUG
885   for (const ChainElem &E : C)
886     assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
887            DL.getTypeStoreSize(VecElemTy));
888 #endif
889 
890   Instruction *VecInst;
891   if (IsLoadChain) {
892     // Loads get hoisted to the location of the first load in the chain.  We may
893     // also need to hoist the (transitive) operands of the loads.
894     Builder.SetInsertPoint(
895         llvm::min_element(C, [](const auto &A, const auto &B) {
896           return A.Inst->comesBefore(B.Inst);
897         })->Inst);
898 
899     // Chain is in offset order, so C[0] is the instr with the lowest offset,
900     // i.e. the root of the vector.
901     VecInst = Builder.CreateAlignedLoad(VecTy,
902                                         getLoadStorePointerOperand(C[0].Inst),
903                                         Alignment);
904 
905     unsigned VecIdx = 0;
906     for (const ChainElem &E : C) {
907       Instruction *I = E.Inst;
908       Value *V;
909       Type *T = getLoadStoreType(I);
910       if (auto *VT = dyn_cast<FixedVectorType>(T)) {
911         auto Mask = llvm::to_vector<8>(
912             llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
913         V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
914         VecIdx += VT->getNumElements();
915       } else {
916         V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
917                                          I->getName());
918         ++VecIdx;
919       }
920       if (V->getType() != I->getType())
921         V = Builder.CreateBitOrPointerCast(V, I->getType());
922       I->replaceAllUsesWith(V);
923     }
924 
925     // Finally, we need to reorder the instrs in the BB so that the (transitive)
926     // operands of VecInst appear before it.  To see why, suppose we have
927     // vectorized the following code:
928     //
929     //   ptr1  = gep a, 1
930     //   load1 = load i32 ptr1
931     //   ptr0  = gep a, 0
932     //   load0 = load i32 ptr0
933     //
934     // We will put the vectorized load at the location of the earliest load in
935     // the BB, i.e. load1.  We get:
936     //
937     //   ptr1  = gep a, 1
938     //   loadv = load <2 x i32> ptr0
939     //   load0 = extractelement loadv, 0
940     //   load1 = extractelement loadv, 1
941     //   ptr0 = gep a, 0
942     //
943     // Notice that loadv uses ptr0, which is defined *after* it!
944     reorder(VecInst);
945   } else {
946     // Stores get sunk to the location of the last store in the chain.
947     Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
948                              return A.Inst->comesBefore(B.Inst);
949                            })->Inst);
950 
951     // Build the vector to store.
952     Value *Vec = PoisonValue::get(VecTy);
953     unsigned VecIdx = 0;
954     auto InsertElem = [&](Value *V) {
955       if (V->getType() != VecElemTy)
956         V = Builder.CreateBitOrPointerCast(V, VecElemTy);
957       Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
958     };
959     for (const ChainElem &E : C) {
960       auto I = cast<StoreInst>(E.Inst);
961       if (FixedVectorType *VT =
962               dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
963         for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
964           InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
965                                                   Builder.getInt32(J)));
966         }
967       } else {
968         InsertElem(I->getValueOperand());
969       }
970     }
971 
972     // Chain is in offset order, so C[0] is the instr with the lowest offset,
973     // i.e. the root of the vector.
974     VecInst = Builder.CreateAlignedStore(
975         Vec,
976         getLoadStorePointerOperand(C[0].Inst),
977         Alignment);
978   }
979 
980   propagateMetadata(VecInst, C);
981 
982   for (const ChainElem &E : C)
983     ToErase.push_back(E.Inst);
984 
985   ++NumVectorInstructions;
986   NumScalarsVectorized += C.size();
987   return true;
988 }
989 
990 template <bool IsLoadChain>
991 bool Vectorizer::isSafeToMove(
992     Instruction *ChainElem, Instruction *ChainBegin,
993     const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
994   LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
995                     << *ChainBegin << ")\n");
996 
997   assert(isa<LoadInst>(ChainElem) == IsLoadChain);
998   if (ChainElem == ChainBegin)
999     return true;
1000 
1001   // Invariant loads can always be reordered; by definition they are not
1002   // clobbered by stores.
1003   if (isInvariantLoad(ChainElem))
1004     return true;
1005 
1006   auto BBIt = std::next([&] {
1007     if constexpr (IsLoadChain)
1008       return BasicBlock::reverse_iterator(ChainElem);
1009     else
1010       return BasicBlock::iterator(ChainElem);
1011   }());
1012   auto BBItEnd = std::next([&] {
1013     if constexpr (IsLoadChain)
1014       return BasicBlock::reverse_iterator(ChainBegin);
1015     else
1016       return BasicBlock::iterator(ChainBegin);
1017   }());
1018 
1019   const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1020   const unsigned ChainElemSize =
1021       DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1022 
1023   for (; BBIt != BBItEnd; ++BBIt) {
1024     Instruction *I = &*BBIt;
1025 
1026     if (!I->mayReadOrWriteMemory())
1027       continue;
1028 
1029     // Loads can be reordered with other loads.
1030     if (IsLoadChain && isa<LoadInst>(I))
1031       continue;
1032 
1033     // Stores can be sunk below invariant loads.
1034     if (!IsLoadChain && isInvariantLoad(I))
1035       continue;
1036 
1037     // If I is in the chain, we can tell whether it aliases ChainElem by checking
1038     // what offset I accesses.  This may be better than AA is able to do.
1039     //
1040     // We should really only have duplicate offsets for stores (the duplicate
1041     // loads should be CSE'ed), but in case we have a duplicate load, we'll
1042     // split the chain so we don't have to handle this case specially.
1043     if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1044       // I and ChainElem overlap if:
1045       //   - I and ChainElem have the same offset, OR
1046       //   - I's offset is less than ChainElem's, but I touches past the
1047       //     beginning of ChainElem, OR
1048       //   - ChainElem's offset is less than I's, but ChainElem touches past the
1049       //     beginning of I.
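      //
      // For example, with 4-byte accesses: offsets 0 and 2 overlap, while
      // offsets 0 and 4 do not.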
1050       const APInt &IOffset = OffsetIt->second;
1051       unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1052       if (IOffset == ChainElemOffset ||
1053           (IOffset.sle(ChainElemOffset) &&
1054            (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1055           (ChainElemOffset.sle(IOffset) &&
1056            (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1057         LLVM_DEBUG({
1058           // Double check that AA also sees this alias.  If not, we probably
1059           // have a bug.
1060           ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1061           assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1062           dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1063         });
1064         return false; // We found an aliasing instruction; bail.
1065       }
1066 
1067       continue; // We're confident there's no alias.
1068     }
1069 
1070     LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1071     ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1072     if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1073       LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1074                         << "  Aliasing instruction:\n"
1075                         << "    " << *I << '\n'
1076                         << "  Aliased instruction and pointer:\n"
1077                         << "    " << *ChainElem << '\n'
1078                         << "    " << *getLoadStorePointerOperand(ChainElem)
1079                         << '\n');
1080 
1081       return false;
1082     }
1083   }
1084   return true;
1085 }
1086 
1087 static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1088   BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1089   return (Signed && BinOpI->hasNoSignedWrap()) ||
1090          (!Signed && BinOpI->hasNoUnsignedWrap());
1091 }
1092 
1093 static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1094                                    unsigned MatchingOpIdxA, Instruction *AddOpB,
1095                                    unsigned MatchingOpIdxB, bool Signed) {
1096   LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1097                     << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1098                     << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1099                     << ", MatchingOpIdxB=" << MatchingOpIdxB
1100                     << ", Signed=" << Signed << "\n");
1101   // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1102   // being the same, we can guarantee that the transformation is safe if we can
1103   // prove that OpA won't overflow when IdxDiff is added to its other operand.
1104   // For example:
1105   //  %tmp7 = add nsw i32 %tmp2, %v0
1106   //  %tmp8 = sext i32 %tmp7 to i64
1107   //  ...
1108   //  %tmp11 = add nsw i32 %v0, 1
1109   //  %tmp12 = add nsw i32 %tmp2, %tmp11
1110   //  %tmp13 = sext i32 %tmp12 to i64
1111   //
1112   //  Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1113   //  It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1114   //  1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1115   assert(AddOpA->getOpcode() == Instruction::Add &&
1116          AddOpB->getOpcode() == Instruction::Add &&
1117          checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1118   if (AddOpA->getOperand(MatchingOpIdxA) ==
1119       AddOpB->getOperand(MatchingOpIdxB)) {
1120     Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1121     Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1122     Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1123     Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1124     // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1125     if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1126         checkNoWrapFlags(OtherInstrB, Signed) &&
1127         isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1128       int64_t CstVal =
1129           cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1130       if (OtherInstrB->getOperand(0) == OtherOperandA &&
1131           IdxDiff.getSExtValue() == CstVal)
1132         return true;
1133     }
1134     // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
1135     if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1136         checkNoWrapFlags(OtherInstrA, Signed) &&
1137         isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1138       int64_t CstVal =
1139           cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1140       if (OtherInstrA->getOperand(0) == OtherOperandB &&
1141           IdxDiff.getSExtValue() == -CstVal)
1142         return true;
1143     }
1144     // Match `x +nsw/nuw (y +nsw/nuw c)` and
1145     // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1146     if (OtherInstrA && OtherInstrB &&
1147         OtherInstrA->getOpcode() == Instruction::Add &&
1148         OtherInstrB->getOpcode() == Instruction::Add &&
1149         checkNoWrapFlags(OtherInstrA, Signed) &&
1150         checkNoWrapFlags(OtherInstrB, Signed) &&
1151         isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1152         isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1153       int64_t CstValA =
1154           cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1155       int64_t CstValB =
1156           cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1157       if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1158           IdxDiff.getSExtValue() == (CstValB - CstValA))
1159         return true;
1160     }
1161   }
1162   return false;
1163 }
1164 
1165 std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1166     Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1167   LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1168                     << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1169                     << " Depth=" << Depth << "\n");
1170   auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1171   auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1172   if (!GEPA || !GEPB)
1173     return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1174 
1175   // Look through GEPs after checking they're the same except for the last
1176   // index.
1177   if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1178       GEPA->getPointerOperand() != GEPB->getPointerOperand())
1179     return std::nullopt;
1180   gep_type_iterator GTIA = gep_type_begin(GEPA);
1181   gep_type_iterator GTIB = gep_type_begin(GEPB);
1182   for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1183     if (GTIA.getOperand() != GTIB.getOperand())
1184       return std::nullopt;
1185     ++GTIA;
1186     ++GTIB;
1187   }
1188 
1189   Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1190   Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1191   if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1192       OpA->getType() != OpB->getType())
1193     return std::nullopt;
1194 
1195   uint64_t Stride = GTIA.getSequentialElementStride(DL);
1196 
1197   // Only look through a ZExt/SExt.
1198   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1199     return std::nullopt;
1200 
1201   bool Signed = isa<SExtInst>(OpA);
1202 
1203   // At this point ValA could be a function parameter, i.e. not an instruction.
1204   Value *ValA = OpA->getOperand(0);
1205   OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1206   if (!OpB || ValA->getType() != OpB->getType())
1207     return std::nullopt;
1208 
1209   const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1210   const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1211   const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1212   if (IdxDiffSCEV == SE.getCouldNotCompute())
1213     return std::nullopt;
1214 
1215   ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1216   if (!IdxDiffRange.isSingleElement())
1217     return std::nullopt;
1218   APInt IdxDiff = *IdxDiffRange.getSingleElement();
1219 
1220   LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1221                     << "\n");
1222 
1223   // Now we need to prove that adding IdxDiff to ValA won't overflow.
1224   bool Safe = false;
1225 
1226   // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1227   // ValA, we're okay.
1228   if (OpB->getOpcode() == Instruction::Add &&
1229       isa<ConstantInt>(OpB->getOperand(1)) &&
1230       IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1231       checkNoWrapFlags(OpB, Signed))
1232     Safe = true;
1233 
1234   // Second attempt: check if we have eligible add NSW/NUW instruction
1235   // sequences.
1236   OpA = dyn_cast<Instruction>(ValA);
1237   if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1238       OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1239       checkNoWrapFlags(OpB, Signed)) {
1240     // In the checks below a matching operand in OpA and OpB is an operand which
1241     // is the same in those two instructions.  Below we account for possible
1242     // orders of the operands of these add instructions.
1243     for (unsigned MatchingOpIdxA : {0, 1})
1244       for (unsigned MatchingOpIdxB : {0, 1})
1245         if (!Safe)
1246           Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1247                                         MatchingOpIdxB, Signed);
1248   }
1249 
1250   unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1251 
1252   // Third attempt:
1253   //
1254   // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1255   // order bit other than the sign bit are known to be zero in ValA, we can add
1256   // Diff to it while guaranteeing no overflow of any sort.
1257   //
1258   // If IdxDiff is negative, do the same, but swap ValA and ValB.
1259   if (!Safe) {
1260     // When computing known bits, use the GEPs as context instructions, since
1261     // they likely are in the same BB as the load/store.
1262     KnownBits Known(BitWidth);
1263     computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
1264                      ContextInst, &DT);
1265     APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1266     if (Signed)
1267       BitsAllowedToBeSet.clearBit(BitWidth - 1);
1268     if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
1269       return std::nullopt;
1270     Safe = true;
1271   }
1272 
1273   if (Safe)
1274     return IdxDiff * Stride;
1275   return std::nullopt;
1276 }
1277 
1278 std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1279     Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1280   if (Depth++ == MaxDepth)
1281     return std::nullopt;
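  // Illustrative sketch (hypothetical IR):
  //   %pA = select i1 %c, ptr %x, ptr %y
  //   %pB = select i1 %c, ptr %x.next, ptr %y.next
  // If the true arms and the false arms each differ by the same constant,
  // that constant is the offset between %pA and %pB for either value of %c.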
1282 
1283   if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1284     if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1285       if (SelectA->getCondition() != SelectB->getCondition())
1286         return std::nullopt;
1287       LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1288                         << ", PtrB=" << *PtrB << ", ContextInst="
1289                         << *ContextInst << ", Depth=" << Depth << "\n");
1290       std::optional<APInt> TrueDiff = getConstantOffset(
1291           SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1292       if (!TrueDiff.has_value())
1293         return std::nullopt;
1294       std::optional<APInt> FalseDiff =
1295           getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1296                             ContextInst, Depth);
1297       if (TrueDiff == FalseDiff)
1298         return TrueDiff;
1299     }
1300   }
1301   return std::nullopt;
1302 }
1303 
1304 EquivalenceClassMap
1305 Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1306                                       BasicBlock::iterator End) {
1307   EquivalenceClassMap Ret;
1308 
1309   auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
1310     const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1311     if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1312       // The selects themselves are distinct instructions even if they share
1313       // the same condition and evaluate to consecutive pointers for the true
1314       // and false values of the condition.  Grouping instructions by the
1315       // selects themselves would therefore put consecutive accesses into
1316       // different lists; they would never even be checked for being
1317       // consecutive, and so would not be vectorized.
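      // Illustration (hypothetical IR):
      //   %p0 = select i1 %cond, ptr %a, ptr %b
      //   %p1 = select i1 %cond, ptr %a.next, ptr %b.next
      // If %a.next/%b.next are one element past %a/%b, the two accesses are
      // consecutive for either value of %cond; keying the class on %cond
      // keeps them in the same list.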
1318       return Sel->getCondition();
1319     }
1320     return ObjPtr;
1321   };
1322 
1323   for (Instruction &I : make_range(Begin, End)) {
1324     auto *LI = dyn_cast<LoadInst>(&I);
1325     auto *SI = dyn_cast<StoreInst>(&I);
1326     if (!LI && !SI)
1327       continue;
1328 
1329     if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1330       continue;
1331 
1332     if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1333         (SI && !TTI.isLegalToVectorizeStore(SI)))
1334       continue;
1335 
1336     Type *Ty = getLoadStoreType(&I);
1337     if (!VectorType::isValidElementType(Ty->getScalarType()))
1338       continue;
1339 
1340     // Skip weird non-byte sizes. They probably aren't worth the effort of
1341     // handling correctly.
1342     unsigned TySize = DL.getTypeSizeInBits(Ty);
1343     if ((TySize % 8) != 0)
1344       continue;
1345 
1346     // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1347     // functions currently use an integer type for the vectorized load/store
1348     // and do not support casting between that integer type and a vector of
1349     // pointers (e.g. i64 to <2 x i16*>).
1350     if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1351       continue;
1352 
1353     Value *Ptr = getLoadStorePointerOperand(&I);
1354     unsigned AS = Ptr->getType()->getPointerAddressSpace();
1355     unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1356 
1357     unsigned VF = VecRegSize / TySize;
1358     VectorType *VecTy = dyn_cast<VectorType>(Ty);
1359 
1360     // Only handle power-of-two sized elements.
1361     if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1362         (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1363       continue;
1364 
1365     // No point in looking at these if they're too big to vectorize.
1366     if (TySize > VecRegSize / 2 ||
1367         (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1368       continue;
1369 
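    // The equivalence-class key is (underlying object, address space, scalar
    // element size in bits, is-load); only accesses that agree on all four
    // can end up in the same chain.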
1370     Ret[{getUnderlyingObject(Ptr), AS,
1371          DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1372          /*IsLoad=*/LI != nullptr}]
1373         .push_back(&I);
1374   }
1375 
1376   return Ret;
1377 }
1378 
1379 std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1380   if (Instrs.empty())
1381     return {};
1382 
1383   unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1384   unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1385 
1386 #ifndef NDEBUG
1387   // Check that Instrs is in BB order and all have the same addr space.
1388   for (size_t I = 1; I < Instrs.size(); ++I) {
1389     assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1390     assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1391   }
1392 #endif
1393 
1394   // Machinery to build an MRU-hashtable of Chains.
1395   //
1396   // (Ideally this could be done with MapVector, but as currently implemented,
1397   // moving an element to the front of a MapVector is O(n).)
1398   struct InstrListElem : ilist_node<InstrListElem>,
1399                          std::pair<Instruction *, Chain> {
1400     explicit InstrListElem(Instruction *I)
1401         : std::pair<Instruction *, Chain>(I, {}) {}
1402   };
1403   struct InstrListElemDenseMapInfo {
1404     using PtrInfo = DenseMapInfo<InstrListElem *>;
1405     using IInfo = DenseMapInfo<Instruction *>;
1406     static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1407     static InstrListElem *getTombstoneKey() {
1408       return PtrInfo::getTombstoneKey();
1409     }
1410     static unsigned getHashValue(const InstrListElem *E) {
1411       return IInfo::getHashValue(E->first);
1412     }
1413     static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1414       if (A == getEmptyKey() || B == getEmptyKey())
1415         return A == getEmptyKey() && B == getEmptyKey();
1416       if (A == getTombstoneKey() || B == getTombstoneKey())
1417         return A == getTombstoneKey() && B == getTombstoneKey();
1418       return IInfo::isEqual(A->first, B->first);
1419     }
1420   };
1421   SpecificBumpPtrAllocator<InstrListElem> Allocator;
1422   simple_ilist<InstrListElem> MRU;
1423   DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
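  // The bump allocator owns the chain nodes, MRU orders them by recency, and
  // Chains records one entry per chain (consulted below only for its size).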
1424 
1425   // Compare each instruction in `Instrs` to the leaders of the N most
1426   // recently-used chains.  This limits the O(n^2) behavior of this pass while
1427   // still allowing us to build arbitrarily long chains.
1428   for (Instruction *I : Instrs) {
1429     constexpr int MaxChainsToTry = 64;
1430 
1431     bool MatchFound = false;
1432     auto ChainIter = MRU.begin();
1433     for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1434          ++J, ++ChainIter) {
1435       std::optional<APInt> Offset = getConstantOffset(
1436           getLoadStorePointerOperand(ChainIter->first),
1437           getLoadStorePointerOperand(I),
1438           /*ContextInst=*/
1439           (ChainIter->first->comesBefore(I) ? I : ChainIter->first));
1440       if (Offset.has_value()) {
1441         // `Offset` might not have the expected number of bits, e.g. if the
1442         // index width of AS differs from the pointer width.
1443         ChainIter->second.push_back(ChainElem{I, Offset.value()});
1444         // Move ChainIter to the front of the MRU list.
1445         MRU.remove(*ChainIter);
1446         MRU.push_front(*ChainIter);
1447         MatchFound = true;
1448         break;
1449       }
1450     }
1451 
1452     if (!MatchFound) {
1453       APInt ZeroOffset(ASPtrBits, 0);
1454       InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1455       E->second.push_back(ChainElem{I, ZeroOffset});
1456       MRU.push_front(*E);
1457       Chains.insert(E);
1458     }
1459   }
1460 
1461   std::vector<Chain> Ret;
1462   Ret.reserve(Chains.size());
1463   // Iterate over MRU rather than Chains so the order is deterministic.
1464   for (auto &E : MRU)
1465     if (E.second.size() > 1)
1466       Ret.push_back(std::move(E.second));
1467   return Ret;
1468 }
1469 
1470 std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1471                                                    Instruction *ContextInst,
1472                                                    unsigned Depth) {
1473   LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1474                     << ", PtrB=" << *PtrB << ", ContextInst=" << *ContextInst
1475                     << ", Depth=" << Depth << "\n");
1476   // We'll ultimately return a value of this bit width, even if computations
1477   // happen in a different width.
1478   unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1479   APInt OffsetA(OrigBitWidth, 0);
1480   APInt OffsetB(OrigBitWidth, 0);
1481   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1482   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1483   unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1484   if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1485     return std::nullopt;
1486 
1487   // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1488   // should properly handle a possible overflow and the value should fit into
1489   // the smallest data type used in the cast/gep chain.
1490   assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1491          OffsetB.getSignificantBits() <= NewPtrBitWidth);
1492 
1493   OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1494   OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
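  // E.g. (hypothetical): "getelementptr inbounds i8, ptr %p, i64 4" and
  // "getelementptr inbounds i8, ptr %p, i64 12" both strip down to %p with
  // accumulated offsets 4 and 12, so the distance is simply 12 - 4 = 8.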
1495   if (PtrA == PtrB)
1496     return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1497 
1498   // Try to compute B - A.
1499   const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1500   if (DistScev != SE.getCouldNotCompute()) {
1501     LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1502     ConstantRange DistRange = SE.getSignedRange(DistScev);
1503     if (DistRange.isSingleElement()) {
1504       // Handle index width (the width of Dist) != pointer width (the width of
1505       // the Offset*s at this point).
1506       APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1507       return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1508     }
1509   }
1510   std::optional<APInt> Diff =
1511       getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
1512   if (Diff.has_value())
1513     return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1514         .sextOrTrunc(OrigBitWidth);
1515   return std::nullopt;
1516 }
1517