1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass merges loads/stores to/from sequential memory addresses into vector
10 // loads/stores. Although there's nothing GPU-specific in here, this pass is
11 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12 //
13 // (For simplicity below we talk about loads only, but everything also applies
14 // to stores.)
15 //
16 // This pass is intended to be run late in the pipeline, after other
17 // vectorization opportunities have been exploited. So the assumption here is
18 // that immediately following our new vector load we'll need to extract out the
19 // individual elements of the load, so we can operate on them individually.
20 //
21 // On CPUs this transformation is usually not beneficial, because extracting the
22 // elements of a vector register is expensive on most architectures. It's
23 // usually better just to load each element individually into its own scalar
24 // register.
25 //
26 // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27 // "vector load" loads directly into a series of scalar registers. In effect,
28 // extracting the elements of the vector is free. It's therefore always
29 // beneficial to vectorize a sequence of loads on these architectures.
30 //
31 // Vectorizing (perhaps a better name might be "coalescing") loads can have
32 // large performance impacts on GPU kernels, and opportunities for vectorizing
33 // are common in GPU code. This pass tries very hard to find such
34 // opportunities; its runtime is quadratic in the number of loads in a BB.
35 //
36 // Some CPU architectures, such as ARM, have instructions that load into
37 // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38 // could use this pass (with some modifications), but currently it implements
39 // its own pass to do something similar to what we do here.
40 //
41 // Overview of the algorithm and terminology in this pass:
42 //
43 // - Break up each basic block into pseudo-BBs, composed of instructions which
44 // are guaranteed to transfer control to their successors.
45 // - Within a single pseudo-BB, find all loads, and group them into
46 // "equivalence classes" according to getUnderlyingObject() and loaded
47 // element size. Do the same for stores.
48 // - For each equivalence class, greedily build "chains". Each chain has a
49 // leader instruction, and every other member of the chain has a known
50 // constant offset from the first instr in the chain.
51 // - Break up chains so that they contain only contiguous accesses of legal
52 // size with no intervening may-alias instrs.
53 // - Convert each chain to vector instructions.
54 //
55 // The O(n^2) behavior of this pass comes from initially building the chains.
56 // In the worst case we have to compare each new instruction to all of those
57 // that came before. To limit this, we only calculate the offset to the leaders
58 // of the N most recently-used chains.
59
60 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
61 #include "llvm/ADT/APInt.h"
62 #include "llvm/ADT/ArrayRef.h"
63 #include "llvm/ADT/DenseMap.h"
64 #include "llvm/ADT/MapVector.h"
65 #include "llvm/ADT/PostOrderIterator.h"
66 #include "llvm/ADT/STLExtras.h"
67 #include "llvm/ADT/Sequence.h"
68 #include "llvm/ADT/SmallPtrSet.h"
69 #include "llvm/ADT/SmallVector.h"
70 #include "llvm/ADT/Statistic.h"
71 #include "llvm/ADT/iterator_range.h"
72 #include "llvm/Analysis/AliasAnalysis.h"
73 #include "llvm/Analysis/AssumptionCache.h"
74 #include "llvm/Analysis/MemoryLocation.h"
75 #include "llvm/Analysis/ScalarEvolution.h"
76 #include "llvm/Analysis/TargetTransformInfo.h"
77 #include "llvm/Analysis/ValueTracking.h"
78 #include "llvm/Analysis/VectorUtils.h"
79 #include "llvm/IR/Attributes.h"
80 #include "llvm/IR/BasicBlock.h"
81 #include "llvm/IR/ConstantRange.h"
82 #include "llvm/IR/Constants.h"
83 #include "llvm/IR/DataLayout.h"
84 #include "llvm/IR/DerivedTypes.h"
85 #include "llvm/IR/Dominators.h"
86 #include "llvm/IR/Function.h"
87 #include "llvm/IR/GetElementPtrTypeIterator.h"
88 #include "llvm/IR/IRBuilder.h"
89 #include "llvm/IR/InstrTypes.h"
90 #include "llvm/IR/Instruction.h"
91 #include "llvm/IR/Instructions.h"
92 #include "llvm/IR/LLVMContext.h"
93 #include "llvm/IR/Module.h"
94 #include "llvm/IR/Type.h"
95 #include "llvm/IR/Value.h"
96 #include "llvm/InitializePasses.h"
97 #include "llvm/Pass.h"
98 #include "llvm/Support/Alignment.h"
99 #include "llvm/Support/Casting.h"
100 #include "llvm/Support/Debug.h"
101 #include "llvm/Support/KnownBits.h"
102 #include "llvm/Support/MathExtras.h"
103 #include "llvm/Support/ModRef.h"
104 #include "llvm/Support/raw_ostream.h"
105 #include "llvm/Transforms/Utils/Local.h"
106 #include <algorithm>
107 #include <cassert>
108 #include <cstdint>
109 #include <cstdlib>
110 #include <iterator>
111 #include <numeric>
112 #include <optional>
113 #include <tuple>
114 #include <type_traits>
115 #include <utility>
116 #include <vector>
117
118 using namespace llvm;
119
120 #define DEBUG_TYPE "load-store-vectorizer"
121
122 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125 namespace {
126
127 // Equivalence class key, the initial tuple by which we group loads/stores.
128 // Loads/stores with different EqClassKeys are never merged.
129 //
130 // (We could in theory remove element-size from the this tuple. We'd just need
131 // to fix up the vector packing/unpacking code.)
132 using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
operator <<(llvm::raw_ostream & OS,const EqClassKey & K)138 [[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144 }
145
146 // A Chain is a set of instructions such that:
147 // - All instructions have the same equivalence class, so in particular all are
148 // loads, or all are stores.
149 // - We know the address accessed by the i'th chain elem relative to the
150 // chain's leader instruction, which is the first instr of the chain in BB
151 // order.
152 //
153 // Chains have two canonical orderings:
154 // - BB order, sorted by Instr->comesBefore.
155 // - Offset order, sorted by OffsetFromLeader.
156 // This pass switches back and forth between these orders.
157 struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
ChainElem__anonc586b4f20111::ChainElem160 ChainElem(Instruction *Inst, APInt OffsetFromLeader)
161 : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
162 };
163 using Chain = SmallVector<ChainElem, 1>;
164
sortChainInBBOrder(Chain & C)165 void sortChainInBBOrder(Chain &C) {
166 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167 }
168
sortChainInOffsetOrder(Chain & C)169 void sortChainInOffsetOrder(Chain &C) {
170 sort(C, [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175 }
176
dumpChain(ArrayRef<ChainElem> C)177 [[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181 }
182
183 using EquivalenceClassMap =
184 MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
185
186 // FIXME: Assuming stack alignment of 4 is always good enough
187 constexpr unsigned StackAdjustedAlignment = 4;
188
propagateMetadata(Instruction * I,const Chain & C)189 Instruction *propagateMetadata(Instruction *I, const Chain &C) {
190 SmallVector<Value *, 8> Values;
191 for (const ChainElem &E : C)
192 Values.emplace_back(E.Inst);
193 return propagateMetadata(I, Values);
194 }
195
isInvariantLoad(const Instruction * I)196 bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(I);
198 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
199 }
200
201 /// Reorders the instructions that I depends on (the instructions defining its
202 /// operands), to ensure they dominate I.
reorder(Instruction * I)203 void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
205 SmallVector<Instruction *, 16> Worklist;
206
207 Worklist.emplace_back(I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int Idx = 0; Idx < NumOperands; Idx++) {
212 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 assert(IM != I && "Unexpected cycle while re-ordering instructions");
222
223 if (!IM->comesBefore(I)) {
224 InstructionsToMove.insert(IM);
225 Worklist.emplace_back(IM);
226 }
227 }
228 }
229
230 // All instructions to move should follow I. Start from I, not from begin().
231 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
232 Instruction *IM = &*(BBI++);
233 if (!InstructionsToMove.contains(IM))
234 continue;
235 IM->moveBefore(I->getIterator());
236 }
237 }
238
239 class Vectorizer {
240 Function &F;
241 AliasAnalysis &AA;
242 AssumptionCache &AC;
243 DominatorTree &DT;
244 ScalarEvolution &SE;
245 TargetTransformInfo &TTI;
246 const DataLayout &DL;
247 IRBuilder<> Builder;
248
249 // We could erase instrs right after vectorizing them, but that can mess up
250 // our BB iterators, and also can make the equivalence class keys point to
251 // freed memory. This is fixable, but it's simpler just to wait until we're
252 // done with the BB and erase all at once.
253 SmallVector<Instruction *, 128> ToErase;
254
255 public:
Vectorizer(Function & F,AliasAnalysis & AA,AssumptionCache & AC,DominatorTree & DT,ScalarEvolution & SE,TargetTransformInfo & TTI)256 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
257 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
258 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
259 DL(F.getDataLayout()), Builder(SE.getContext()) {}
260
261 bool run();
262
263 private:
264 static const unsigned MaxDepth = 3;
265
266 /// Runs the vectorizer on a "pseudo basic block", which is a range of
267 /// instructions [Begin, End) within one BB all of which have
268 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
269 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
270
271 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
272 /// in the same BB with the same value for getUnderlyingObject() etc.
273 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
274 ArrayRef<Instruction *> EqClass);
275
276 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
277 /// where all instructions access a known, constant offset from the first
278 /// instruction.
279 bool runOnChain(Chain &C);
280
281 /// Splits the chain into subchains of instructions which read/write a
282 /// contiguous block of memory. Discards any length-1 subchains (because
283 /// there's nothing to vectorize in there).
284 std::vector<Chain> splitChainByContiguity(Chain &C);
285
286 /// Splits the chain into subchains where it's safe to hoist loads up to the
287 /// beginning of the sub-chain and it's safe to sink loads up to the end of
288 /// the sub-chain. Discards any length-1 subchains.
289 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
290
291 /// Splits the chain into subchains that make legal, aligned accesses.
292 /// Discards any length-1 subchains.
293 std::vector<Chain> splitChainByAlignment(Chain &C);
294
295 /// Converts the instrs in the chain into a single vectorized load or store.
296 /// Adds the old scalar loads/stores to ToErase.
297 bool vectorizeChain(Chain &C);
298
299 /// Tries to compute the offset in bytes PtrB - PtrA.
300 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
301 Instruction *ContextInst,
302 unsigned Depth = 0);
303 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
304 Instruction *ContextInst,
305 unsigned Depth);
306 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
307 Instruction *ContextInst,
308 unsigned Depth);
309
310 /// Gets the element type of the vector that the chain will load or store.
311 /// This is nontrivial because the chain may contain elements of different
312 /// types; e.g. it's legal to have a chain that contains both i32 and float.
313 Type *getChainElemTy(const Chain &C);
314
315 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
316 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
317 /// instructions.
318 ///
319 /// The map ChainElemOffsets must contain all of the elements in
320 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
321 /// address. It's ok if it contains additional entries.
322 template <bool IsLoadChain>
323 bool isSafeToMove(
324 Instruction *ChainElem, Instruction *ChainBegin,
325 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
326 BatchAAResults &BatchAA);
327
328 /// Merges the equivalence classes if they have underlying objects that differ
329 /// by one level of indirection (i.e., one is a getelementptr and the other is
330 /// the base pointer in that getelementptr).
331 void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
332
333 /// Collects loads and stores grouped by "equivalence class", where:
334 /// - all elements in an eq class are a load or all are a store,
335 /// - they all load/store the same element size (it's OK to have e.g. i8 and
336 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
337 /// - they all have the same value for getUnderlyingObject().
338 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
339 BasicBlock::iterator End);
340
341 /// Partitions Instrs into "chains" where every instruction has a known
342 /// constant offset from the first instr in the chain.
343 ///
344 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
345 /// in the chain is the leader, and an instr touches distance 0 from itself.
346 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
347 };
348
349 class LoadStoreVectorizerLegacyPass : public FunctionPass {
350 public:
351 static char ID;
352
LoadStoreVectorizerLegacyPass()353 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
354 initializeLoadStoreVectorizerLegacyPassPass(
355 *PassRegistry::getPassRegistry());
356 }
357
358 bool runOnFunction(Function &F) override;
359
getPassName() const360 StringRef getPassName() const override {
361 return "GPU Load and Store Vectorizer";
362 }
363
getAnalysisUsage(AnalysisUsage & AU) const364 void getAnalysisUsage(AnalysisUsage &AU) const override {
365 AU.addRequired<AAResultsWrapperPass>();
366 AU.addRequired<AssumptionCacheTracker>();
367 AU.addRequired<ScalarEvolutionWrapperPass>();
368 AU.addRequired<DominatorTreeWrapperPass>();
369 AU.addRequired<TargetTransformInfoWrapperPass>();
370 AU.setPreservesCFG();
371 }
372 };
373
374 } // end anonymous namespace
375
376 char LoadStoreVectorizerLegacyPass::ID = 0;
377
378 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
379 "Vectorize load and Store instructions", false, false)
380 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
381 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
382 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)383 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
384 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
385 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
386 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
387 "Vectorize load and store instructions", false, false)
388
389 Pass *llvm::createLoadStoreVectorizerPass() {
390 return new LoadStoreVectorizerLegacyPass();
391 }
392
runOnFunction(Function & F)393 bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
394 // Don't vectorize when the attribute NoImplicitFloat is used.
395 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
396 return false;
397
398 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
399 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
400 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
401 TargetTransformInfo &TTI =
402 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
403
404 AssumptionCache &AC =
405 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
406
407 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
408 }
409
run(Function & F,FunctionAnalysisManager & AM)410 PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
411 FunctionAnalysisManager &AM) {
412 // Don't vectorize when the attribute NoImplicitFloat is used.
413 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
414 return PreservedAnalyses::all();
415
416 AliasAnalysis &AA = AM.getResult<AAManager>(F);
417 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
418 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
419 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
420 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
421
422 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
423 PreservedAnalyses PA;
424 PA.preserveSet<CFGAnalyses>();
425 return Changed ? PA : PreservedAnalyses::all();
426 }
427
run()428 bool Vectorizer::run() {
429 bool Changed = false;
430 // Break up the BB if there are any instrs which aren't guaranteed to transfer
431 // execution to their successor.
432 //
433 // Consider, for example:
434 //
435 // def assert_arr_len(int n) { if (n < 2) exit(); }
436 //
437 // load arr[0]
438 // call assert_array_len(arr.length)
439 // load arr[1]
440 //
441 // Even though assert_arr_len does not read or write any memory, we can't
442 // speculate the second load before the call. More info at
443 // https://github.com/llvm/llvm-project/issues/52950.
444 for (BasicBlock *BB : post_order(&F)) {
445 // BB must at least have a terminator.
446 assert(!BB->empty());
447
448 SmallVector<BasicBlock::iterator, 8> Barriers;
449 Barriers.emplace_back(BB->begin());
450 for (Instruction &I : *BB)
451 if (!isGuaranteedToTransferExecutionToSuccessor(&I))
452 Barriers.emplace_back(I.getIterator());
453 Barriers.emplace_back(BB->end());
454
455 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
456 ++It)
457 Changed |= runOnPseudoBB(*It, *std::next(It));
458
459 for (Instruction *I : ToErase) {
460 auto *PtrOperand = getLoadStorePointerOperand(I);
461 if (I->use_empty())
462 I->eraseFromParent();
463 RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
464 }
465 ToErase.clear();
466 }
467
468 return Changed;
469 }
470
runOnPseudoBB(BasicBlock::iterator Begin,BasicBlock::iterator End)471 bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
472 BasicBlock::iterator End) {
473 LLVM_DEBUG({
474 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
475 if (End != Begin->getParent()->end())
476 dbgs() << *End;
477 else
478 dbgs() << "<BB end>";
479 dbgs() << ")\n";
480 });
481
482 bool Changed = false;
483 for (const auto &[EqClassKey, EqClass] :
484 collectEquivalenceClasses(Begin, End))
485 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
486
487 return Changed;
488 }
489
runOnEquivalenceClass(const EqClassKey & EqClassKey,ArrayRef<Instruction * > EqClass)490 bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
491 ArrayRef<Instruction *> EqClass) {
492 bool Changed = false;
493
494 LLVM_DEBUG({
495 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
496 << " keyed on " << EqClassKey << ":\n";
497 for (Instruction *I : EqClass)
498 dbgs() << " " << *I << "\n";
499 });
500
501 std::vector<Chain> Chains = gatherChains(EqClass);
502 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
503 << " nontrivial chains.\n";);
504 for (Chain &C : Chains)
505 Changed |= runOnChain(C);
506 return Changed;
507 }
508
runOnChain(Chain & C)509 bool Vectorizer::runOnChain(Chain &C) {
510 LLVM_DEBUG({
511 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
512 dumpChain(C);
513 });
514
515 // Split up the chain into increasingly smaller chains, until we can finally
516 // vectorize the chains.
517 //
518 // (Don't be scared by the depth of the loop nest here. These operations are
519 // all at worst O(n lg n) in the number of instructions, and splitting chains
520 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
521 bool Changed = false;
522 for (auto &C : splitChainByMayAliasInstrs(C))
523 for (auto &C : splitChainByContiguity(C))
524 for (auto &C : splitChainByAlignment(C))
525 Changed |= vectorizeChain(C);
526 return Changed;
527 }
528
splitChainByMayAliasInstrs(Chain & C)529 std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
530 if (C.empty())
531 return {};
532
533 sortChainInBBOrder(C);
534
535 LLVM_DEBUG({
536 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
537 dumpChain(C);
538 });
539
540 // We know that elements in the chain with nonverlapping offsets can't
541 // alias, but AA may not be smart enough to figure this out. Use a
542 // hashtable.
543 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
544 for (const auto &E : C)
545 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
546
547 // Across a single invocation of this function the IR is not changing, so
548 // using a batched Alias Analysis is safe and can reduce compile time.
549 BatchAAResults BatchAA(AA);
550
551 // Loads get hoisted up to the first load in the chain. Stores get sunk
552 // down to the last store in the chain. Our algorithm for loads is:
553 //
554 // - Take the first element of the chain. This is the start of a new chain.
555 // - Take the next element of `Chain` and check for may-alias instructions
556 // up to the start of NewChain. If no may-alias instrs, add it to
557 // NewChain. Otherwise, start a new NewChain.
558 //
559 // For stores it's the same except in the reverse direction.
560 //
561 // We expect IsLoad to be an std::bool_constant.
562 auto Impl = [&](auto IsLoad) {
563 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
564 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
565 if constexpr (IsLoad())
566 return std::make_pair(C.begin(), C.end());
567 else
568 return std::make_pair(C.rbegin(), C.rend());
569 }(IsLoad);
570 assert(ChainBegin != ChainEnd);
571
572 std::vector<Chain> Chains;
573 SmallVector<ChainElem, 1> NewChain;
574 NewChain.emplace_back(*ChainBegin);
575 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
576 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
577 ChainOffsets, BatchAA)) {
578 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
579 << *ChainIt->Inst << " into " << *ChainBegin->Inst
580 << "\n");
581 NewChain.emplace_back(*ChainIt);
582 } else {
583 LLVM_DEBUG(
584 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
585 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
586 if (NewChain.size() > 1) {
587 LLVM_DEBUG({
588 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
589 dumpChain(NewChain);
590 });
591 Chains.emplace_back(std::move(NewChain));
592 }
593
594 // Start a new chain.
595 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
596 }
597 }
598 if (NewChain.size() > 1) {
599 LLVM_DEBUG({
600 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
601 dumpChain(NewChain);
602 });
603 Chains.emplace_back(std::move(NewChain));
604 }
605 return Chains;
606 };
607
608 if (isa<LoadInst>(C[0].Inst))
609 return Impl(/*IsLoad=*/std::bool_constant<true>());
610
611 assert(isa<StoreInst>(C[0].Inst));
612 return Impl(/*IsLoad=*/std::bool_constant<false>());
613 }
614
splitChainByContiguity(Chain & C)615 std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
616 if (C.empty())
617 return {};
618
619 sortChainInOffsetOrder(C);
620
621 LLVM_DEBUG({
622 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
623 dumpChain(C);
624 });
625
626 std::vector<Chain> Ret;
627 Ret.push_back({C.front()});
628
629 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
630 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
631 auto &CurChain = Ret.back();
632 const ChainElem &Prev = CurChain.back();
633 unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
634 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
635 "collectEquivalenceClass");
636 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
637
638 // Add this instruction to the end of the current chain, or start a new one.
639 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
640 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
641 << (AreContiguous ? "" : "not ") << "contiguous: "
642 << *Prev.Inst << " (ends at offset " << PrevReadEnd
643 << ") -> " << *It->Inst << " (starts at offset "
644 << It->OffsetFromLeader << ")\n");
645 if (AreContiguous)
646 CurChain.push_back(*It);
647 else
648 Ret.push_back({*It});
649 }
650
651 // Filter out length-1 chains, these are uninteresting.
652 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
653 return Ret;
654 }
655
getChainElemTy(const Chain & C)656 Type *Vectorizer::getChainElemTy(const Chain &C) {
657 assert(!C.empty());
658 // The rules are:
659 // - If there are any pointer types in the chain, use an integer type.
660 // - Prefer an integer type if it appears in the chain.
661 // - Otherwise, use the first type in the chain.
662 //
663 // The rule about pointer types is a simplification when we merge e.g. a load
664 // of a ptr and a double. There's no direct conversion from a ptr to a
665 // double; it requires a ptrtoint followed by a bitcast.
666 //
667 // It's unclear to me if the other rules have any practical effect, but we do
668 // it to match this pass's previous behavior.
669 if (any_of(C, [](const ChainElem &E) {
670 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
671 })) {
672 return Type::getIntNTy(
673 F.getContext(),
674 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
675 }
676
677 for (const ChainElem &E : C)
678 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
679 return T;
680 return getLoadStoreType(C[0].Inst)->getScalarType();
681 }
682
splitChainByAlignment(Chain & C)683 std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
684 // We use a simple greedy algorithm.
685 // - Given a chain of length N, find all prefixes that
686 // (a) are not longer than the max register length, and
687 // (b) are a power of 2.
688 // - Starting from the longest prefix, try to create a vector of that length.
689 // - If one of them works, great. Repeat the algorithm on any remaining
690 // elements in the chain.
691 // - If none of them work, discard the first element and repeat on a chain
692 // of length N-1.
693 if (C.empty())
694 return {};
695
696 sortChainInOffsetOrder(C);
697
698 LLVM_DEBUG({
699 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
700 dumpChain(C);
701 });
702
703 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
704 auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
705 unsigned ChainSizeBytes, VectorType *VecTy) {
706 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
707 ChainSizeBytes, VecTy)
708 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
709 ChainSizeBytes, VecTy);
710 };
711
712 #ifndef NDEBUG
713 for (const auto &E : C) {
714 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
715 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
716 "Should have filtered out non-power-of-two elements in "
717 "collectEquivalenceClasses.");
718 }
719 #endif
720
721 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
722 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
723
724 std::vector<Chain> Ret;
725 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
726 // Find candidate chains of size not greater than the largest vector reg.
727 // These chains are over the closed interval [CBegin, CEnd].
728 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
729 CandidateChains;
730 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
731 APInt Sz = C[CEnd].OffsetFromLeader +
732 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
733 C[CBegin].OffsetFromLeader;
734 if (Sz.sgt(VecRegBytes))
735 break;
736 CandidateChains.emplace_back(CEnd,
737 static_cast<unsigned>(Sz.getLimitedValue()));
738 }
739
740 // Consider the longest chain first.
741 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
742 It != End; ++It) {
743 auto [CEnd, SizeBytes] = *It;
744 LLVM_DEBUG(
745 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
746 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
747
748 Type *VecElemTy = getChainElemTy(C);
749 // Note, VecElemTy is a power of 2, but might be less than one byte. For
750 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
751 // VecElemTy would be i4.
752 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
753
754 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
755 assert((8 * SizeBytes) % VecElemBits == 0);
756 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
757 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
758 unsigned VF = 8 * VecRegBytes / VecElemBits;
759
760 // Check that TTI is happy with this vectorization factor.
761 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
762 VecElemBits * NumVecElems / 8, VecTy);
763 if (TargetVF != VF && TargetVF < NumVecElems) {
764 LLVM_DEBUG(
765 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
766 "because TargetVF="
767 << TargetVF << " != VF=" << VF
768 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
769 continue;
770 }
771
772 // Is a load/store with this alignment allowed by TTI and at least as fast
773 // as an unvectorized load/store?
774 //
775 // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
776 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
777 &F = F](Align Alignment) {
778 if (Alignment.value() % SizeBytes == 0)
779 return true;
780 unsigned VectorizedSpeed = 0;
781 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
782 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
783 if (!AllowsMisaligned) {
784 LLVM_DEBUG(dbgs()
785 << "LSV: Access of " << SizeBytes << "B in addrspace "
786 << AS << " with alignment " << Alignment.value()
787 << " is misaligned, and therefore can't be vectorized.\n");
788 return false;
789 }
790
791 unsigned ElementwiseSpeed = 0;
792 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
793 Alignment, &ElementwiseSpeed);
794 if (VectorizedSpeed < ElementwiseSpeed) {
795 LLVM_DEBUG(dbgs()
796 << "LSV: Access of " << SizeBytes << "B in addrspace "
797 << AS << " with alignment " << Alignment.value()
798 << " has relative speed " << VectorizedSpeed
799 << ", which is lower than the elementwise speed of "
800 << ElementwiseSpeed
801 << ". Therefore this access won't be vectorized.\n");
802 return false;
803 }
804 return true;
805 };
806
807 // If we're loading/storing from an alloca, align it if possible.
808 //
809 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
810 // tells us this is beneficial. This feels a bit odd, but it matches
811 // existing tests. This isn't *so* bad, because at most we align to 4
812 // bytes (current value of StackAdjustedAlignment).
813 //
814 // FIXME: We will upgrade the alignment of the alloca even if it turns out
815 // we can't vectorize for some other reason.
816 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
817 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
818 isa<AllocaInst>(PtrOperand->stripPointerCasts());
819 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
820 Align PrefAlign = Align(StackAdjustedAlignment);
821 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
822 IsAllowedAndFast(PrefAlign)) {
823 Align NewAlign = getOrEnforceKnownAlignment(
824 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
825 if (NewAlign >= Alignment) {
826 LLVM_DEBUG(dbgs()
827 << "LSV: splitByChain upgrading alloca alignment from "
828 << Alignment.value() << " to " << NewAlign.value()
829 << "\n");
830 Alignment = NewAlign;
831 }
832 }
833
834 if (!IsAllowedAndFast(Alignment)) {
835 LLVM_DEBUG(
836 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
837 "because its alignment is not AllowedAndFast: "
838 << Alignment.value() << "\n");
839 continue;
840 }
841
842 if ((IsLoadChain &&
843 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
844 (!IsLoadChain &&
845 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
846 LLVM_DEBUG(
847 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
848 "because !isLegalToVectorizeLoad/StoreChain.");
849 continue;
850 }
851
852 // Hooray, we can vectorize this chain!
853 Chain &NewChain = Ret.emplace_back();
854 for (unsigned I = CBegin; I <= CEnd; ++I)
855 NewChain.emplace_back(C[I]);
856 CBegin = CEnd; // Skip over the instructions we've added to the chain.
857 break;
858 }
859 }
860 return Ret;
861 }
862
vectorizeChain(Chain & C)863 bool Vectorizer::vectorizeChain(Chain &C) {
864 if (C.size() < 2)
865 return false;
866
867 sortChainInOffsetOrder(C);
868
869 LLVM_DEBUG({
870 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
871 dumpChain(C);
872 });
873
874 Type *VecElemTy = getChainElemTy(C);
875 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
876 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
877 unsigned ChainBytes = std::accumulate(
878 C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
879 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
880 });
881 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
882 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
883 // than 1 byte (e.g. VecTy == <32 x i1>).
884 Type *VecTy = FixedVectorType::get(
885 VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
886
887 Align Alignment = getLoadStoreAlignment(C[0].Inst);
888 // If this is a load/store of an alloca, we might have upgraded the alloca's
889 // alignment earlier. Get the new alignment.
890 if (AS == DL.getAllocaAddrSpace()) {
891 Alignment = std::max(
892 Alignment,
893 getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
894 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
895 }
896
897 // All elements of the chain must have the same scalar-type size.
898 #ifndef NDEBUG
899 for (const ChainElem &E : C)
900 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
901 DL.getTypeStoreSize(VecElemTy));
902 #endif
903
904 Instruction *VecInst;
905 if (IsLoadChain) {
906 // Loads get hoisted to the location of the first load in the chain. We may
907 // also need to hoist the (transitive) operands of the loads.
908 Builder.SetInsertPoint(
909 llvm::min_element(C, [](const auto &A, const auto &B) {
910 return A.Inst->comesBefore(B.Inst);
911 })->Inst);
912
913 // Chain is in offset order, so C[0] is the instr with the lowest offset,
914 // i.e. the root of the vector.
915 VecInst = Builder.CreateAlignedLoad(VecTy,
916 getLoadStorePointerOperand(C[0].Inst),
917 Alignment);
918
919 unsigned VecIdx = 0;
920 for (const ChainElem &E : C) {
921 Instruction *I = E.Inst;
922 Value *V;
923 Type *T = getLoadStoreType(I);
924 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
925 auto Mask = llvm::to_vector<8>(
926 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
927 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
928 VecIdx += VT->getNumElements();
929 } else {
930 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
931 I->getName());
932 ++VecIdx;
933 }
934 if (V->getType() != I->getType())
935 V = Builder.CreateBitOrPointerCast(V, I->getType());
936 I->replaceAllUsesWith(V);
937 }
938
939 // Finally, we need to reorder the instrs in the BB so that the (transitive)
940 // operands of VecInst appear before it. To see why, suppose we have
941 // vectorized the following code:
942 //
943 // ptr1 = gep a, 1
944 // load1 = load i32 ptr1
945 // ptr0 = gep a, 0
946 // load0 = load i32 ptr0
947 //
948 // We will put the vectorized load at the location of the earliest load in
949 // the BB, i.e. load1. We get:
950 //
951 // ptr1 = gep a, 1
952 // loadv = load <2 x i32> ptr0
953 // load0 = extractelement loadv, 0
954 // load1 = extractelement loadv, 1
955 // ptr0 = gep a, 0
956 //
957 // Notice that loadv uses ptr0, which is defined *after* it!
958 reorder(VecInst);
959 } else {
960 // Stores get sunk to the location of the last store in the chain.
961 Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
962 return A.Inst->comesBefore(B.Inst);
963 })->Inst);
964
965 // Build the vector to store.
966 Value *Vec = PoisonValue::get(VecTy);
967 unsigned VecIdx = 0;
968 auto InsertElem = [&](Value *V) {
969 if (V->getType() != VecElemTy)
970 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
971 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
972 };
973 for (const ChainElem &E : C) {
974 auto *I = cast<StoreInst>(E.Inst);
975 if (FixedVectorType *VT =
976 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
977 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
978 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
979 Builder.getInt32(J)));
980 }
981 } else {
982 InsertElem(I->getValueOperand());
983 }
984 }
985
986 // Chain is in offset order, so C[0] is the instr with the lowest offset,
987 // i.e. the root of the vector.
988 VecInst = Builder.CreateAlignedStore(
989 Vec,
990 getLoadStorePointerOperand(C[0].Inst),
991 Alignment);
992 }
993
994 propagateMetadata(VecInst, C);
995
996 for (const ChainElem &E : C)
997 ToErase.emplace_back(E.Inst);
998
999 ++NumVectorInstructions;
1000 NumScalarsVectorized += C.size();
1001 return true;
1002 }
1003
1004 template <bool IsLoadChain>
isSafeToMove(Instruction * ChainElem,Instruction * ChainBegin,const DenseMap<Instruction *,APInt> & ChainOffsets,BatchAAResults & BatchAA)1005 bool Vectorizer::isSafeToMove(
1006 Instruction *ChainElem, Instruction *ChainBegin,
1007 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
1008 BatchAAResults &BatchAA) {
1009 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
1010 << *ChainBegin << ")\n");
1011
1012 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1013 if (ChainElem == ChainBegin)
1014 return true;
1015
1016 // Invariant loads can always be reordered; by definition they are not
1017 // clobbered by stores.
1018 if (isInvariantLoad(ChainElem))
1019 return true;
1020
1021 auto BBIt = std::next([&] {
1022 if constexpr (IsLoadChain)
1023 return BasicBlock::reverse_iterator(ChainElem);
1024 else
1025 return BasicBlock::iterator(ChainElem);
1026 }());
1027 auto BBItEnd = std::next([&] {
1028 if constexpr (IsLoadChain)
1029 return BasicBlock::reverse_iterator(ChainBegin);
1030 else
1031 return BasicBlock::iterator(ChainBegin);
1032 }());
1033
1034 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1035 const unsigned ChainElemSize =
1036 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1037
1038 for (; BBIt != BBItEnd; ++BBIt) {
1039 Instruction *I = &*BBIt;
1040
1041 if (!I->mayReadOrWriteMemory())
1042 continue;
1043
1044 // Loads can be reordered with other loads.
1045 if (IsLoadChain && isa<LoadInst>(I))
1046 continue;
1047
1048 // Stores can be sunk below invariant loads.
1049 if (!IsLoadChain && isInvariantLoad(I))
1050 continue;
1051
1052 // If I is in the chain, we can tell whether it aliases ChainIt by checking
1053 // what offset ChainIt accesses. This may be better than AA is able to do.
1054 //
1055 // We should really only have duplicate offsets for stores (the duplicate
1056 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1057 // split the chain so we don't have to handle this case specially.
1058 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1059 // I and ChainElem overlap if:
1060 // - I and ChainElem have the same offset, OR
1061 // - I's offset is less than ChainElem's, but I touches past the
1062 // beginning of ChainElem, OR
1063 // - ChainElem's offset is less than I's, but ChainElem touches past the
1064 // beginning of I.
1065 const APInt &IOffset = OffsetIt->second;
1066 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1067 if (IOffset == ChainElemOffset ||
1068 (IOffset.sle(ChainElemOffset) &&
1069 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1070 (ChainElemOffset.sle(IOffset) &&
1071 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1072 LLVM_DEBUG({
1073 // Double check that AA also sees this alias. If not, we probably
1074 // have a bug.
1075 ModRefInfo MR =
1076 BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1077 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1078 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1079 });
1080 return false; // We found an aliasing instruction; bail.
1081 }
1082
1083 continue; // We're confident there's no alias.
1084 }
1085
1086 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1087 ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1088 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1089 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1090 << " Aliasing instruction:\n"
1091 << " " << *I << '\n'
1092 << " Aliased instruction and pointer:\n"
1093 << " " << *ChainElem << '\n'
1094 << " " << *getLoadStorePointerOperand(ChainElem)
1095 << '\n');
1096
1097 return false;
1098 }
1099 }
1100 return true;
1101 }
1102
checkNoWrapFlags(Instruction * I,bool Signed)1103 static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1104 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1105 return (Signed && BinOpI->hasNoSignedWrap()) ||
1106 (!Signed && BinOpI->hasNoUnsignedWrap());
1107 }
1108
checkIfSafeAddSequence(const APInt & IdxDiff,Instruction * AddOpA,unsigned MatchingOpIdxA,Instruction * AddOpB,unsigned MatchingOpIdxB,bool Signed)1109 static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1110 unsigned MatchingOpIdxA, Instruction *AddOpB,
1111 unsigned MatchingOpIdxB, bool Signed) {
1112 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1113 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1114 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1115 << ", MatchingOpIdxB=" << MatchingOpIdxB
1116 << ", Signed=" << Signed << "\n");
1117 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1118 // being the same, we can guarantee that the transformation is safe if we can
1119 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1120 // For example:
1121 // %tmp7 = add nsw i32 %tmp2, %v0
1122 // %tmp8 = sext i32 %tmp7 to i64
1123 // ...
1124 // %tmp11 = add nsw i32 %v0, 1
1125 // %tmp12 = add nsw i32 %tmp2, %tmp11
1126 // %tmp13 = sext i32 %tmp12 to i64
1127 //
1128 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1129 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1130 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1131 assert(AddOpA->getOpcode() == Instruction::Add &&
1132 AddOpB->getOpcode() == Instruction::Add &&
1133 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1134 if (AddOpA->getOperand(MatchingOpIdxA) ==
1135 AddOpB->getOperand(MatchingOpIdxB)) {
1136 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1137 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1138 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1139 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1140 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1141 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1142 checkNoWrapFlags(OtherInstrB, Signed) &&
1143 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1144 int64_t CstVal =
1145 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1146 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1147 IdxDiff.getSExtValue() == CstVal)
1148 return true;
1149 }
1150 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
1151 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1152 checkNoWrapFlags(OtherInstrA, Signed) &&
1153 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1154 int64_t CstVal =
1155 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1156 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1157 IdxDiff.getSExtValue() == -CstVal)
1158 return true;
1159 }
1160 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1161 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1162 if (OtherInstrA && OtherInstrB &&
1163 OtherInstrA->getOpcode() == Instruction::Add &&
1164 OtherInstrB->getOpcode() == Instruction::Add &&
1165 checkNoWrapFlags(OtherInstrA, Signed) &&
1166 checkNoWrapFlags(OtherInstrB, Signed) &&
1167 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1168 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1169 int64_t CstValA =
1170 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1171 int64_t CstValB =
1172 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1173 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1174 IdxDiff.getSExtValue() == (CstValB - CstValA))
1175 return true;
1176 }
1177 }
1178 return false;
1179 }
1180
getConstantOffsetComplexAddrs(Value * PtrA,Value * PtrB,Instruction * ContextInst,unsigned Depth)1181 std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1182 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1183 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1184 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1185 << " Depth=" << Depth << "\n");
1186 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1187 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1188 if (!GEPA || !GEPB)
1189 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1190
1191 // Look through GEPs after checking they're the same except for the last
1192 // index.
1193 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1194 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1195 return std::nullopt;
1196 gep_type_iterator GTIA = gep_type_begin(GEPA);
1197 gep_type_iterator GTIB = gep_type_begin(GEPB);
1198 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1199 if (GTIA.getOperand() != GTIB.getOperand())
1200 return std::nullopt;
1201 ++GTIA;
1202 ++GTIB;
1203 }
1204
1205 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1206 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1207 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1208 OpA->getType() != OpB->getType())
1209 return std::nullopt;
1210
1211 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1212
1213 // Only look through a ZExt/SExt.
1214 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1215 return std::nullopt;
1216
1217 bool Signed = isa<SExtInst>(OpA);
1218
1219 // At this point A could be a function parameter, i.e. not an instruction
1220 Value *ValA = OpA->getOperand(0);
1221 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1222 if (!OpB || ValA->getType() != OpB->getType())
1223 return std::nullopt;
1224
1225 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1226 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1227 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1228 if (IdxDiffSCEV == SE.getCouldNotCompute())
1229 return std::nullopt;
1230
1231 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1232 if (!IdxDiffRange.isSingleElement())
1233 return std::nullopt;
1234 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1235
1236 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1237 << "\n");
1238
1239 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1240 bool Safe = false;
1241
1242 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1243 // ValA, we're okay.
1244 if (OpB->getOpcode() == Instruction::Add &&
1245 isa<ConstantInt>(OpB->getOperand(1)) &&
1246 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1247 checkNoWrapFlags(OpB, Signed))
1248 Safe = true;
1249
1250 // Second attempt: check if we have eligible add NSW/NUW instruction
1251 // sequences.
1252 OpA = dyn_cast<Instruction>(ValA);
1253 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1254 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1255 checkNoWrapFlags(OpB, Signed)) {
1256 // In the checks below a matching operand in OpA and OpB is an operand which
1257 // is the same in those two instructions. Below we account for possible
1258 // orders of the operands of these add instructions.
1259 for (unsigned MatchingOpIdxA : {0, 1})
1260 for (unsigned MatchingOpIdxB : {0, 1})
1261 if (!Safe)
1262 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1263 MatchingOpIdxB, Signed);
1264 }
1265
1266 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1267
1268 // Third attempt:
1269 //
1270 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1271 // order bit other than the sign bit are known to be zero in ValA, we can add
1272 // Diff to it while guaranteeing no overflow of any sort.
1273 //
1274 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1275 if (!Safe) {
1276 // When computing known bits, use the GEPs as context instructions, since
1277 // they likely are in the same BB as the load/store.
1278 KnownBits Known(BitWidth);
1279 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, &AC, ContextInst,
1280 &DT);
1281 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1282 if (Signed)
1283 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1284 Safe = BitsAllowedToBeSet.uge(IdxDiff.abs());
1285 }
1286
1287 if (Safe)
1288 return IdxDiff * Stride;
1289 return std::nullopt;
1290 }
1291
getConstantOffsetSelects(Value * PtrA,Value * PtrB,Instruction * ContextInst,unsigned Depth)1292 std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1293 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1294 if (Depth++ == MaxDepth)
1295 return std::nullopt;
1296
1297 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1298 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1299 if (SelectA->getCondition() != SelectB->getCondition())
1300 return std::nullopt;
1301 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1302 << ", PtrB=" << *PtrB << ", ContextInst="
1303 << *ContextInst << ", Depth=" << Depth << "\n");
1304 std::optional<APInt> TrueDiff = getConstantOffset(
1305 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1306 if (!TrueDiff)
1307 return std::nullopt;
1308 std::optional<APInt> FalseDiff =
1309 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1310 ContextInst, Depth);
1311 if (TrueDiff == FalseDiff)
1312 return TrueDiff;
1313 }
1314 }
1315 return std::nullopt;
1316 }
1317
mergeEquivalenceClasses(EquivalenceClassMap & EQClasses) const1318 void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
1319 if (EQClasses.size() < 2) // There is nothing to merge.
1320 return;
1321
1322 // The reduced key has all elements of the ECClassKey except the underlying
1323 // object. Check that EqClassKey has 4 elements and define the reduced key.
1324 static_assert(std::tuple_size_v<EqClassKey> == 4,
1325 "EqClassKey has changed - EqClassReducedKey needs changes too");
1326 using EqClassReducedKey =
1327 std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
1328 std::tuple_element_t<2, EqClassKey> /* Element size */,
1329 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
1330 using ECReducedKeyToUnderlyingObjectMap =
1331 MapVector<EqClassReducedKey,
1332 SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
1333
1334 // Form a map from the reduced key (without the underlying object) to the
1335 // underlying objects: 1 reduced key to many underlying objects, to form
1336 // groups of potentially merge-able equivalence classes.
1337 ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
1338 bool FoundPotentiallyOptimizableEC = false;
1339 for (const auto &EC : EQClasses) {
1340 const auto &Key = EC.first;
1341 EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
1342 std::get<3>(Key)};
1343 auto &UOMap = RedKeyToUOMap[RedKey];
1344 UOMap.insert(std::get<0>(Key));
1345 if (UOMap.size() > 1)
1346 FoundPotentiallyOptimizableEC = true;
1347 }
1348 if (!FoundPotentiallyOptimizableEC)
1349 return;
1350
1351 LLVM_DEBUG({
1352 dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
1353 for (const auto &EC : EQClasses) {
1354 dbgs() << " Key: {" << EC.first << "}\n";
1355 for (const auto &Inst : EC.second)
1356 dbgs() << " Inst: " << *Inst << '\n';
1357 }
1358 });
1359 LLVM_DEBUG({
1360 dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
1361 for (const auto &RedKeyToUO : RedKeyToUOMap) {
1362 dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
1363 << std::get<1>(RedKeyToUO.first) << ", "
1364 << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
1365 << RedKeyToUO.second.size() << " underlying objects:\n";
1366 for (auto UObject : RedKeyToUO.second)
1367 dbgs() << " " << *UObject << '\n';
1368 }
1369 });
1370
1371 using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
1372
1373 // Compute the ultimate targets for a set of underlying objects.
1374 auto GetUltimateTargets =
1375 [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
1376 UObjectToUObjectMap IndirectionMap;
1377 for (const auto *UObject : UObjects) {
1378 const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
1379 const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
1380 if (UltimateTarget != UObject)
1381 IndirectionMap[UObject] = UltimateTarget;
1382 }
1383 UObjectToUObjectMap UltimateTargetsMap;
1384 for (const auto *UObject : UObjects) {
1385 auto Target = UObject;
1386 auto It = IndirectionMap.find(Target);
1387 for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
1388 Target = It->second;
1389 UltimateTargetsMap[UObject] = Target;
1390 }
1391 return UltimateTargetsMap;
1392 };
1393
1394 // For each item in RedKeyToUOMap, if it has more than one underlying object,
1395 // try to merge the equivalence classes.
1396 for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
1397 if (UObjects.size() < 2)
1398 continue;
1399 auto UTMap = GetUltimateTargets(UObjects);
1400 for (const auto &[UObject, UltimateTarget] : UTMap) {
1401 if (UObject == UltimateTarget)
1402 continue;
1403
1404 EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
1405 std::get<2>(RedKey)};
1406 EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
1407 std::get<2>(RedKey)};
1408 // The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus,
1409 // request the reference to the instructions vector for KeyTo first.
1410 const auto &VecTo = EQClasses[KeyTo];
1411 const auto &VecFrom = EQClasses[KeyFrom];
1412 SmallVector<Instruction *, 8> MergedVec;
1413 std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
1414 std::back_inserter(MergedVec),
1415 [](Instruction *A, Instruction *B) {
1416 return A && B && A->comesBefore(B);
1417 });
1418 EQClasses[KeyTo] = std::move(MergedVec);
1419 EQClasses.erase(KeyFrom);
1420 }
1421 }
1422 LLVM_DEBUG({
1423 dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
1424 for (const auto &EC : EQClasses) {
1425 dbgs() << " Key: {" << EC.first << "}\n";
1426 for (const auto &Inst : EC.second)
1427 dbgs() << " Inst: " << *Inst << '\n';
1428 }
1429 });
1430 }
1431
1432 EquivalenceClassMap
collectEquivalenceClasses(BasicBlock::iterator Begin,BasicBlock::iterator End)1433 Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1434 BasicBlock::iterator End) {
1435 EquivalenceClassMap Ret;
1436
1437 auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
1438 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1439 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1440 // The select's themselves are distinct instructions even if they share
1441 // the same condition and evaluate to consecutive pointers for true and
1442 // false values of the condition. Therefore using the select's themselves
1443 // for grouping instructions would put consecutive accesses into different
1444 // lists and they won't be even checked for being consecutive, and won't
1445 // be vectorized.
1446 return Sel->getCondition();
1447 }
1448 return ObjPtr;
1449 };
1450
1451 for (Instruction &I : make_range(Begin, End)) {
1452 auto *LI = dyn_cast<LoadInst>(&I);
1453 auto *SI = dyn_cast<StoreInst>(&I);
1454 if (!LI && !SI)
1455 continue;
1456
1457 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1458 continue;
1459
1460 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1461 (SI && !TTI.isLegalToVectorizeStore(SI)))
1462 continue;
1463
1464 Type *Ty = getLoadStoreType(&I);
1465 if (!VectorType::isValidElementType(Ty->getScalarType()))
1466 continue;
1467
1468 // Skip weird non-byte sizes. They probably aren't worth the effort of
1469 // handling correctly.
1470 unsigned TySize = DL.getTypeSizeInBits(Ty);
1471 if ((TySize % 8) != 0)
1472 continue;
1473
1474 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1475 // functions are currently using an integer type for the vectorized
1476 // load/store, and does not support casting between the integer type and a
1477 // vector of pointers (e.g. i64 to <2 x i16*>)
1478 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1479 continue;
1480
1481 Value *Ptr = getLoadStorePointerOperand(&I);
1482 unsigned AS = Ptr->getType()->getPointerAddressSpace();
1483 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1484
1485 unsigned VF = VecRegSize / TySize;
1486 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1487
1488 // Only handle power-of-two sized elements.
1489 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1490 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1491 continue;
1492
1493 // No point in looking at these if they're too big to vectorize.
1494 if (TySize > VecRegSize / 2 ||
1495 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1496 continue;
1497
1498 Ret[{GetUnderlyingObject(Ptr), AS,
1499 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1500 /*IsLoad=*/LI != nullptr}]
1501 .emplace_back(&I);
1502 }
1503
1504 mergeEquivalenceClasses(Ret);
1505 return Ret;
1506 }
1507
gatherChains(ArrayRef<Instruction * > Instrs)1508 std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1509 if (Instrs.empty())
1510 return {};
1511
1512 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1513 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1514
1515 #ifndef NDEBUG
1516 // Check that Instrs is in BB order and all have the same addr space.
1517 for (size_t I = 1; I < Instrs.size(); ++I) {
1518 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1519 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1520 }
1521 #endif
1522
1523 // Machinery to build an MRU-hashtable of Chains.
1524 //
1525 // (Ideally this could be done with MapVector, but as currently implemented,
1526 // moving an element to the front of a MapVector is O(n).)
1527 struct InstrListElem : ilist_node<InstrListElem>,
1528 std::pair<Instruction *, Chain> {
1529 explicit InstrListElem(Instruction *I)
1530 : std::pair<Instruction *, Chain>(I, {}) {}
1531 };
1532 struct InstrListElemDenseMapInfo {
1533 using PtrInfo = DenseMapInfo<InstrListElem *>;
1534 using IInfo = DenseMapInfo<Instruction *>;
1535 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1536 static InstrListElem *getTombstoneKey() {
1537 return PtrInfo::getTombstoneKey();
1538 }
1539 static unsigned getHashValue(const InstrListElem *E) {
1540 return IInfo::getHashValue(E->first);
1541 }
1542 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1543 if (A == getEmptyKey() || B == getEmptyKey())
1544 return A == getEmptyKey() && B == getEmptyKey();
1545 if (A == getTombstoneKey() || B == getTombstoneKey())
1546 return A == getTombstoneKey() && B == getTombstoneKey();
1547 return IInfo::isEqual(A->first, B->first);
1548 }
1549 };
1550 SpecificBumpPtrAllocator<InstrListElem> Allocator;
1551 simple_ilist<InstrListElem> MRU;
1552 DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1553
1554 // Compare each instruction in `instrs` to leader of the N most recently-used
1555 // chains. This limits the O(n^2) behavior of this pass while also allowing
1556 // us to build arbitrarily long chains.
1557 for (Instruction *I : Instrs) {
1558 constexpr int MaxChainsToTry = 64;
1559
1560 bool MatchFound = false;
1561 auto ChainIter = MRU.begin();
1562 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1563 ++J, ++ChainIter) {
1564 if (std::optional<APInt> Offset = getConstantOffset(
1565 getLoadStorePointerOperand(ChainIter->first),
1566 getLoadStorePointerOperand(I),
1567 /*ContextInst=*/
1568 (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
1569 // `Offset` might not have the expected number of bits, if e.g. AS has a
1570 // different number of bits than opaque pointers.
1571 ChainIter->second.emplace_back(I, Offset.value());
1572 // Move ChainIter to the front of the MRU list.
1573 MRU.remove(*ChainIter);
1574 MRU.push_front(*ChainIter);
1575 MatchFound = true;
1576 break;
1577 }
1578 }
1579
1580 if (!MatchFound) {
1581 APInt ZeroOffset(ASPtrBits, 0);
1582 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1583 E->second.emplace_back(I, ZeroOffset);
1584 MRU.push_front(*E);
1585 Chains.insert(E);
1586 }
1587 }
1588
1589 std::vector<Chain> Ret;
1590 Ret.reserve(Chains.size());
1591 // Iterate over MRU rather than Chains so the order is deterministic.
1592 for (auto &E : MRU)
1593 if (E.second.size() > 1)
1594 Ret.emplace_back(std::move(E.second));
1595 return Ret;
1596 }
1597
getConstantOffset(Value * PtrA,Value * PtrB,Instruction * ContextInst,unsigned Depth)1598 std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1599 Instruction *ContextInst,
1600 unsigned Depth) {
1601 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1602 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1603 << ", Depth=" << Depth << "\n");
1604 // We'll ultimately return a value of this bit width, even if computations
1605 // happen in a different width.
1606 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1607 APInt OffsetA(OrigBitWidth, 0);
1608 APInt OffsetB(OrigBitWidth, 0);
1609 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1610 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1611 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1612 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1613 return std::nullopt;
1614
1615 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1616 // should properly handle a possible overflow and the value should fit into
1617 // the smallest data type used in the cast/gep chain.
1618 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1619 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1620
1621 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1622 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1623 if (PtrA == PtrB)
1624 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1625
1626 // Try to compute B - A.
1627 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1628 if (DistScev != SE.getCouldNotCompute()) {
1629 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1630 ConstantRange DistRange = SE.getSignedRange(DistScev);
1631 if (DistRange.isSingleElement()) {
1632 // Handle index width (the width of Dist) != pointer width (the width of
1633 // the Offset*s at this point).
1634 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1635 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1636 }
1637 }
1638 if (std::optional<APInt> Diff =
1639 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
1640 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1641 .sextOrTrunc(OrigBitWidth);
1642 return std::nullopt;
1643 }
1644