//===- InterleavedAccessPass.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Interleaved Access pass, which identifies
// interleaved memory accesses and transforms them into target specific
// intrinsics.
//
// An interleaved load reads data from memory into several vectors,
// DE-interleaving the data by a factor. An interleaved store writes several
// vectors to memory, RE-interleaving the data by a factor.
//
// As interleaved accesses are difficult to identify in CodeGen (mainly
// because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
// IR), we identify and transform them to intrinsics in this pass so the
// intrinsics can be easily matched into target specific instructions later in
// CodeGen.
//
// E.g. An interleaved load (Factor = 2):
//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
//        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
//        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
//
// It could be transformed into an ld2 intrinsic in the AArch64 backend or a
// vld2 intrinsic in the ARM backend.
//
// On X86, this can be further optimized into a set of target
// specific loads followed by an optimized sequence of shuffles.
//
// E.g. An interleaved store (Factor = 3):
//        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
//                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
//        store <12 x i32> %i.vec, <12 x i32>* %ptr
//
// It could be transformed into an st3 intrinsic in the AArch64 backend or a
// vst3 intrinsic in the ARM backend.
//
// Similarly, a set of interleaved stores can be transformed into an optimized
// sequence of shuffles followed by a set of target specific stores on X86.
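//
// For reference, a sketch of what the lowered form of the Factor = 2 load
// above might look like on AArch64 (illustrative only; the exact intrinsic
// name and signature are target specific):
//        %ld2 = call { <4 x i32>, <4 x i32> }
//                    @llvm.aarch64.neon.ld2.v4i32.p0(ptr %ptr)
//        %v0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
//        %v1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1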
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/InterleavedAccess.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "interleaved-access"

static cl::opt<bool> LowerInterleavedAccesses(
    "lower-interleaved-accesses",
    cl::desc("Enable lowering interleaved accesses to intrinsics"),
    cl::init(true), cl::Hidden);

namespace {

class InterleavedAccessImpl {
  friend class InterleavedAccess;

public:
  InterleavedAccessImpl() = default;
  InterleavedAccessImpl(DominatorTree *DT, const TargetLowering *TLI)
      : DT(DT), TLI(TLI), MaxFactor(TLI->getMaxSupportedInterleaveFactor()) {}
  bool runOnFunction(Function &F);

private:
  DominatorTree *DT = nullptr;
  const TargetLowering *TLI = nullptr;

  /// The maximum supported interleave factor.
  unsigned MaxFactor = 0u;

  /// Transform an interleaved load into target specific intrinsics.
  bool lowerInterleavedLoad(Instruction *Load,
                            SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform an interleaved store into target specific intrinsics.
  bool lowerInterleavedStore(Instruction *Store,
                             SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform a load and a deinterleave intrinsic into target specific
  /// instructions.
  bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
                                  SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform an interleave intrinsic and a store into target specific
  /// instructions.
  bool lowerInterleaveIntrinsic(IntrinsicInst *II,
                                SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Returns true if the uses of an interleaved load by the
  /// extractelement instructions in \p Extracts can be replaced by uses of the
  /// shufflevector instructions in \p Shuffles instead. If so, the necessary
  /// replacements are also performed.
  bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
                          ArrayRef<ShuffleVectorInst *> Shuffles);

  /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
  /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
  /// interleaved load. Any newly created shuffles that operate on \p LI will
  /// be added to \p Shuffles. Returns true if any changes to the IR have been
  /// made.
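  /// For illustration (a sketch, not a verbatim transformation):
  ///   %b = mul <8 x i32> %x, %y
  ///   %s = shuffle <8 x i32> %b, <8 x i32> poison, <0, 2, 4, 6>
  /// becomes
  ///   %x.s = shuffle <8 x i32> %x, <8 x i32> poison, <0, 2, 4, 6>
  ///   %y.s = shuffle <8 x i32> %y, <8 x i32> poison, <0, 2, 4, 6>
  ///   %b.s = mul <4 x i32> %x.s, %y.s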
  bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
                            SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
                            Instruction *LI);
};

class InterleavedAccess : public FunctionPass {
  InterleavedAccessImpl Impl;

public:
  static char ID;

  InterleavedAccess() : FunctionPass(ID) {
    initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "Interleaved Access Pass"; }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace.

PreservedAnalyses InterleavedAccessPass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
  auto *DT = &FAM.getResult<DominatorTreeAnalysis>(F);
  auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
  InterleavedAccessImpl Impl(DT, TLI);
  bool Changed = Impl.runOnFunction(F);

  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

char InterleavedAccess::ID = 0;

bool InterleavedAccess::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC || !LowerInterleavedAccesses)
    return false;

  LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");

  Impl.DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto &TM = TPC->getTM<TargetMachine>();
  Impl.TLI = TM.getSubtargetImpl(F)->getTargetLowering();
  Impl.MaxFactor = Impl.TLI->getMaxSupportedInterleaveFactor();

  return Impl.runOnFunction(F);
}

INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)

FunctionPass *llvm::createInterleavedAccessPass() {
  return new InterleavedAccess();
}

/// Check if the mask is a DE-interleave mask for an interleaved load.
///
/// E.g. DE-interleave masks (Factor = 2) could be:
///     <0, 2, 4, 6> (mask of index 0 to extract even elements)
///     <1, 3, 5, 7> (mask of index 1 to extract odd elements)
static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
                               unsigned &Index, unsigned MaxFactor,
                               unsigned NumLoadElements) {
  if (Mask.size() < 2)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    // Make sure we don't produce a load wider than the input load.
    if (Mask.size() * Factor > NumLoadElements)
      return false;
    if (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor, Index))
      return true;
  }

  return false;
}

/// Check if the mask can be used in an interleaved store.
///
/// It checks for a more general pattern than the RE-interleave mask.
/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
/// E.g. For a Factor of 3 (LaneLen=4):
///      <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
///
/// The particular case of an RE-interleave mask is:
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
                               unsigned MaxFactor) {
  unsigned NumElts = SVI->getShuffleMask().size();
  if (NumElts < 4)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    if (SVI->isInterleave(Factor))
      return true;
  }

  return false;
}

// Return the corresponding de-interleaved mask, or nullptr if there is no
// valid mask.
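//
// E.g. (a sketch of the fixed-length case): with Factor = 2, the wide mask
// <1, 1, 0, 0, 1, 1> de-interleaves to the leaf mask <1, 0, 1>, because each
// group of Factor consecutive wide-mask elements must be homogeneous and
// contributes exactly one leaf element.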
static Value *getMask(Value *WideMask, unsigned Factor,
                      ElementCount LeafValueEC);

static Value *getMask(Value *WideMask, unsigned Factor,
                      VectorType *LeafValueTy) {
  return getMask(WideMask, Factor, LeafValueTy->getElementCount());
}

bool InterleavedAccessImpl::lowerInterleavedLoad(
    Instruction *Load, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (isa<ScalableVectorType>(Load->getType()))
    return false;

  if (auto *LI = dyn_cast<LoadInst>(Load)) {
    if (!LI->isSimple())
      return false;
  } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
    assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
    // Require a constant mask.
    if (!isa<ConstantVector>(VPLoad->getMaskParam()))
      return false;
  } else {
    llvm_unreachable("unsupported load operation");
  }

  // Check if all users of this load are shufflevectors. If we encounter any
  // users that are extractelement instructions or binary operators, we save
  // them to later check if they can be modified to extract from one of the
  // shufflevectors instead of the load.

  SmallVector<ShuffleVectorInst *, 4> Shuffles;
  SmallVector<ExtractElementInst *, 4> Extracts;
  // BinOpShuffles need to be handled only once, in case both operands of the
  // binop are the same load.
  SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;

  for (auto *User : Load->users()) {
    auto *Extract = dyn_cast<ExtractElementInst>(User);
    if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
      Extracts.push_back(Extract);
      continue;
    }
    if (auto *BI = dyn_cast<BinaryOperator>(User)) {
      if (!BI->user_empty() && all_of(BI->users(), [](auto *U) {
            auto *SVI = dyn_cast<ShuffleVectorInst>(U);
            return SVI && isa<UndefValue>(SVI->getOperand(1));
          })) {
        for (auto *SVI : BI->users())
          BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI));
        continue;
      }
    }
    auto *SVI = dyn_cast<ShuffleVectorInst>(User);
    if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
      return false;

    Shuffles.push_back(SVI);
  }

  if (Shuffles.empty() && BinOpShuffles.empty())
    return false;

  unsigned Factor, Index;

  unsigned NumLoadElements =
      cast<FixedVectorType>(Load->getType())->getNumElements();
  auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
  // Check if the first shufflevector is a DE-interleave shuffle.
  if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
                          NumLoadElements))
    return false;

  // Holds the corresponding index for each DE-interleave shuffle.
  SmallVector<unsigned, 4> Indices;

  Type *VecTy = FirstSVI->getType();

  // Check that the other shufflevectors are also DE-interleave shuffles of the
  // same type and factor as the first shufflevector.
  for (auto *Shuffle : Shuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor(
            Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
    Indices.push_back(Index);
  }
  for (auto *Shuffle : BinOpShuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor(
            Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);

    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == Load)
      Indices.push_back(Index);
    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == Load)
      Indices.push_back(Index);
  }

  // Try and modify users of the load that are extractelement instructions to
  // use the shufflevector instructions instead of the load.
  if (!tryReplaceExtracts(Extracts, Shuffles))
    return false;

  bool BinOpShuffleChanged =
      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);

  if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
    Value *LaneMask =
        getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
    if (!LaneMask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");

    // Sometimes the number of Shuffles is less than Factor, so we have to
    // fill the gaps with null. Also, lowerInterleavedVPLoad expects them to
    // be sorted.
    SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
    for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
      ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
      return !Extracts.empty() || BinOpShuffleChanged;
  } else {
    LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");

    // Try to create target specific intrinsics to replace the load and
    // shuffles.
    if (!TLI->lowerInterleavedLoad(cast<LoadInst>(Load), Shuffles, Indices,
                                   Factor))
      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
      return !Extracts.empty() || BinOpShuffleChanged;
  }

  DeadInsts.insert_range(Shuffles);

  DeadInsts.insert(Load);
  return true;
}

bool InterleavedAccessImpl::replaceBinOpShuffles(
    ArrayRef<ShuffleVectorInst *> BinOpShuffles,
    SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *Load) {
  for (auto *SVI : BinOpShuffles) {
    BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
    Type *BIOp0Ty = BI->getOperand(0)->getType();
    ArrayRef<int> Mask = SVI->getShuffleMask();
    assert(all_of(Mask, [&](int Idx) {
      return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements();
    }));

    BasicBlock::iterator insertPos = SVI->getIterator();
    auto *NewSVI1 =
        new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty),
                              Mask, SVI->getName(), insertPos);
    auto *NewSVI2 = new ShuffleVectorInst(
        BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
        SVI->getName(), insertPos);
    BinaryOperator *NewBI = BinaryOperator::CreateWithCopiedFlags(
        BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), insertPos);
    SVI->replaceAllUsesWith(NewBI);
    LLVM_DEBUG(dbgs() << "  Replaced: " << *BI << "\n    And   : " << *SVI
                      << "\n  With    : " << *NewSVI1 << "\n    And   : "
                      << *NewSVI2 << "\n  And     : " << *NewBI << "\n");
    RecursivelyDeleteTriviallyDeadInstructions(SVI);
    if (NewSVI1->getOperand(0) == Load)
      Shuffles.push_back(NewSVI1);
    if (NewSVI2->getOperand(0) == Load)
      Shuffles.push_back(NewSVI2);
  }

  return !BinOpShuffles.empty();
}

bool InterleavedAccessImpl::tryReplaceExtracts(
    ArrayRef<ExtractElementInst *> Extracts,
    ArrayRef<ShuffleVectorInst *> Shuffles) {
  // If there aren't any extractelement instructions to modify, there's nothing
  // to do.
  if (Extracts.empty())
    return true;

  // Maps extractelement instructions to vector-index pairs. The extractelement
  // instructions will be modified to use the new vector and index operands.
  DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;

  for (auto *Extract : Extracts) {
    // The index of the vector element that is extracted.
    auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
    auto Index = IndexOperand->getSExtValue();

    // Look for a suitable shufflevector instruction. The goal is to modify the
    // extractelement instruction (which uses an interleaved load) to use one
    // of the shufflevector instructions instead of the load.
    for (auto *Shuffle : Shuffles) {
      // If the shufflevector instruction doesn't dominate the extract, we
      // can't create a use of it.
      if (!DT->dominates(Shuffle, Extract))
        continue;

      // Inspect the indices of the shufflevector instruction. If the shuffle
      // selects the same index that is extracted, we can modify the
      // extractelement instruction.
      SmallVector<int, 4> Indices;
      Shuffle->getShuffleMask(Indices);
      for (unsigned I = 0; I < Indices.size(); ++I)
        if (Indices[I] == Index) {
          assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
                 "Vector operations do not match");
          ReplacementMap[Extract] = std::make_pair(Shuffle, I);
          break;
        }

      // If we found a suitable shufflevector instruction, stop looking.
      if (ReplacementMap.count(Extract))
        break;
    }

    // If we did not find a suitable shufflevector instruction, the
    // extractelement instruction cannot be modified, so we must give up.
    if (!ReplacementMap.count(Extract))
      return false;
  }

  // Finally, perform the replacements.
  IRBuilder<> Builder(Extracts[0]->getContext());
  for (auto &Replacement : ReplacementMap) {
    auto *Extract = Replacement.first;
    auto *Vector = Replacement.second.first;
    auto Index = Replacement.second.second;
    Builder.SetInsertPoint(Extract);
    Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
    Extract->eraseFromParent();
  }

  return true;
}

bool InterleavedAccessImpl::lowerInterleavedStore(
    Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) {
  Value *StoredValue;
  if (auto *SI = dyn_cast<StoreInst>(Store)) {
    if (!SI->isSimple())
      return false;
    StoredValue = SI->getValueOperand();
  } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
    assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
    // Require a constant mask.
    if (!isa<ConstantVector>(VPStore->getMaskParam()))
      return false;
    StoredValue = VPStore->getArgOperand(0);
  } else {
    llvm_unreachable("unsupported store operation");
  }

  auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
  if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
    return false;

  unsigned NumStoredElements =
      cast<FixedVectorType>(SVI->getType())->getNumElements();
  // Check if the shufflevector is an RE-interleave shuffle.
  unsigned Factor;
  if (!isReInterleaveMask(SVI, Factor, MaxFactor))
    return false;
  assert(NumStoredElements % Factor == 0 &&
         "number of stored elements should be a multiple of Factor");

  if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
    unsigned LaneMaskLen = NumStoredElements / Factor;
    Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
                              ElementCount::getFixed(LaneMaskLen));
    if (!LaneMask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
                      << "\n");

    IRBuilder<> Builder(VPStore);
    // We need to effectively de-interleave the shuffle mask, because
    // lowerInterleavedVPStore expects individual de-interleaved values.
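    // Worked example (illustrative): for Factor = 2 and the RE-interleave
    // mask <0, 4, 1, 5, 2, 6, 3, 7>, the loop below produces the masks
    // <0, 1, 2, 3> (i = 0) and <4, 5, 6, 7> (i = 1), since
    // NewShuffleMask[j] = ShuffleMask[i + Factor * j].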
    SmallVector<Value *, 10> NewShuffles;
    SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
    auto ShuffleMask = SVI->getShuffleMask();

    for (unsigned i = 0; i < Factor; i++) {
      for (unsigned j = 0; j < LaneMaskLen; j++)
        NewShuffleMask[j] = ShuffleMask[i + Factor * j];

      NewShuffles.push_back(Builder.CreateShuffleVector(
          SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
    }

    // Try to create target specific intrinsics to replace the vp.store and
    // shuffle.
    if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles))
      // We already created new shuffles.
      return true;
  } else {
    LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");

    // Try to create target specific intrinsics to replace the store and
    // shuffle.
    if (!TLI->lowerInterleavedStore(cast<StoreInst>(Store), SVI, Factor))
      return false;
  }

  // Already have a new target specific interleaved store. Erase the old store.
  DeadInsts.insert(Store);
  DeadInsts.insert(SVI);
  return true;
}

static Value *getMask(Value *WideMask, unsigned Factor,
                      ElementCount LeafValueEC) {
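  // A mask built by interleaving one value with itself Factor times, e.g.
  // llvm.vector.interleave2(%m, %m), de-interleaves back to that value.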
  if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
    if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
        F && F == Factor && llvm::all_equal(IMI->args())) {
      return IMI->getArgOperand(0);
    }
  }

  if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
    if (auto *Splat = ConstMask->getSplatValue())
      // All-ones or all-zeros mask.
      return ConstantVector::getSplat(LeafValueEC, Splat);

    if (LeafValueEC.isFixed()) {
      unsigned LeafMaskLen = LeafValueEC.getFixedValue();
      SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
      // If this is a fixed-length constant mask, each lane / leaf has to
      // use the same mask. This is done by checking if every group with Factor
      // number of elements in the interleaved mask has homogeneous values.
      for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
        Constant *C = ConstMask->getAggregateElement(Idx);
        if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
          return nullptr;
        LeafMask[Idx / Factor] = C;
      }

      return ConstantVector::get(LeafMask);
    }
  }

  return nullptr;
}

bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
    IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
  Value *LoadedVal = DI->getOperand(0);
  if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
    return false;

  const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
  assert(Factor && "unexpected deinterleave intrinsic");

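  // The pattern being matched here (illustrative, Factor = 2):
  //   %l = load <8 x i32>, ptr %ptr
  //   %di = call { <4 x i32>, <4 x i32> }
  //               @llvm.vector.deinterleave2.v8i32(<8 x i32> %l)
  //   %v0 = extractvalue { <4 x i32>, <4 x i32> } %di, 0
  //   %v1 = extractvalue { <4 x i32>, <4 x i32> } %di, 1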
  SmallVector<Value *, 8> DeinterleaveValues(Factor, nullptr);
  Value *LastFactor = nullptr;
  for (auto *User : DI->users()) {
    auto *Extract = dyn_cast<ExtractValueInst>(User);
    if (!Extract || Extract->getNumIndices() != 1)
      return false;
    unsigned Idx = Extract->getIndices()[0];
    if (DeinterleaveValues[Idx])
      return false;
    DeinterleaveValues[Idx] = Extract;
    LastFactor = Extract;
  }

  if (!LastFactor)
    return false;

  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
    if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
      return false;
    // Check the mask operand. Handle both all-true/false and interleaved
    // masks.
    Value *WideMask = VPLoad->getOperand(1);
    Value *Mask =
        getMask(WideMask, Factor, cast<VectorType>(LastFactor->getType()));
    if (!Mask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic "
                      << *DI << " and factor = " << Factor << "\n");

    // Since lowerInterleavedLoad expects Shuffles and LoadInst, use a special
    // TLI function to emit a target-specific interleaved instruction.
    if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
      return false;

  } else {
    auto *LI = cast<LoadInst>(LoadedVal);
    if (!LI->isSimple())
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
                      << " and factor = " << Factor << "\n");

    // Try and match this with target specific intrinsics.
    if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
      return false;
  }

  for (Value *V : DeinterleaveValues)
    if (V)
      DeadInsts.insert(cast<Instruction>(V));
  DeadInsts.insert(DI);
  // We now have a target-specific load, so delete the old one.
  DeadInsts.insert(cast<Instruction>(LoadedVal));
  return true;
}

bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
    IntrinsicInst *II, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (!II->hasOneUse())
    return false;
  Value *StoredBy = II->user_back();
  if (!isa<StoreInst, VPIntrinsic>(StoredBy))
    return false;

  SmallVector<Value *, 8> InterleaveValues(II->args());
  const unsigned Factor = getInterleaveIntrinsicFactor(II->getIntrinsicID());
  assert(Factor && "unexpected interleave intrinsic");

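  // The pattern being matched here (illustrative, Factor = 2):
  //   %iv = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %v0,
  //                                                       <4 x i32> %v1)
  //   store <8 x i32> %iv, ptr %ptr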
  if (auto *VPStore = dyn_cast<VPIntrinsic>(StoredBy)) {
    if (VPStore->getIntrinsicID() != Intrinsic::vp_store)
      return false;

    Value *WideMask = VPStore->getOperand(2);
    Value *Mask = getMask(WideMask, Factor,
                          cast<VectorType>(InterleaveValues[0]->getType()));
    if (!Mask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic "
                      << *II << " and factor = " << Factor << "\n");

    // Since lowerInterleavedStore expects a Shuffle and a StoreInst, use a
    // special TLI function to emit a target-specific interleaved instruction.
    if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues))
      return false;
  } else {
    auto *SI = cast<StoreInst>(StoredBy);
    if (!SI->isSimple())
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II
                      << " and factor = " << Factor << "\n");

    // Try and match this with target specific intrinsics.
    if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
      return false;
  }

  // We now have a target-specific store, so delete the old one.
  DeadInsts.insert(cast<Instruction>(StoredBy));
  DeadInsts.insert(II);
  return true;
}

bool InterleavedAccessImpl::runOnFunction(Function &F) {
  // Holds dead instructions that will be erased later.
  SmallSetVector<Instruction *, 32> DeadInsts;
  bool Changed = false;

  using namespace PatternMatch;
  for (auto &I : instructions(F)) {
    if (match(&I, m_CombineOr(m_Load(m_Value()),
                              m_Intrinsic<Intrinsic::vp_load>())))
      Changed |= lowerInterleavedLoad(&I, DeadInsts);

    if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
                              m_Intrinsic<Intrinsic::vp_store>())))
      Changed |= lowerInterleavedStore(&I, DeadInsts);

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      if (getDeinterleaveIntrinsicFactor(II->getIntrinsicID()))
        Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
      else if (getInterleaveIntrinsicFactor(II->getIntrinsicID()))
        Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
    }
  }

  for (auto *I : DeadInsts)
    I->eraseFromParent();

  return Changed;
}