xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp (revision a3266ba2697a383d2ede56803320d941866c7e76)
1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// This file implements SLP analysis based on VPlan. The analysis is based on
9 /// the ideas described in
10 ///
11 ///   Look-ahead SLP: auto-vectorization in the presence of commutative
12 ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
13 ///   Luís F. W. Góes
14 ///
15 //===----------------------------------------------------------------------===//
16 
17 #include "VPlan.h"
18 #include "llvm/ADT/DepthFirstIterator.h"
19 #include "llvm/ADT/PostOrderIterator.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/VectorUtils.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/CFG.h"
26 #include "llvm/IR/Dominators.h"
27 #include "llvm/IR/InstrTypes.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Type.h"
31 #include "llvm/IR/Value.h"
32 #include "llvm/Support/Casting.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/ErrorHandling.h"
35 #include "llvm/Support/GraphWriter.h"
36 #include "llvm/Support/raw_ostream.h"
37 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
38 #include <cassert>
39 #include <iterator>
40 #include <string>
41 #include <vector>
42 
43 using namespace llvm;
44 
45 #define DEBUG_TYPE "vplan-slp"
46 
47 // Number of levels to look ahead when re-ordering multi node operands.
48 static unsigned LookaheadMaxDepth = 5;
49 
50 VPInstruction *VPlanSlp::markFailed() {
51   // FIXME: Currently this is used to signal we hit instructions we cannot
52   //        trivially SLP'ize.
53   CompletelySLP = false;
54   return nullptr;
55 }
56 
57 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
58   if (all_of(Operands, [](VPValue *V) {
59         return cast<VPInstruction>(V)->getUnderlyingInstr();
60       })) {
61     unsigned BundleSize = 0;
62     for (VPValue *V : Operands) {
63       Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
64       assert(!T->isVectorTy() && "Only scalar types supported for now");
65       BundleSize += T->getScalarSizeInBits();
66     }
67     WidestBundleBits = std::max(WidestBundleBits, BundleSize);
68   }
69 
70   auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
71   assert(Res.second &&
72          "Already created a combined instruction for the operand bundle");
73   (void)Res;
74 }
75 
76 bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
77   // Currently we only support VPInstructions.
78   if (!all_of(Operands, [](VPValue *Op) {
79         return Op && isa<VPInstruction>(Op) &&
80                cast<VPInstruction>(Op)->getUnderlyingInstr();
81       })) {
82     LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
83     return false;
84   }
85 
86   // Check if opcodes and type width agree for all instructions in the bundle.
87   // FIXME: Differing widths/opcodes can be handled by inserting additional
88   //        instructions.
89   // FIXME: Deal with non-primitive types.
90   const Instruction *OriginalInstr =
91       cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
92   unsigned Opcode = OriginalInstr->getOpcode();
93   unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
94   if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
95         const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
96         return I->getOpcode() == Opcode &&
97                I->getType()->getPrimitiveSizeInBits() == Width;
98       })) {
99     LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
100     return false;
101   }
102 
103   // For now, all operands must be defined in the same BB.
104   if (any_of(Operands, [this](VPValue *Op) {
105         return cast<VPInstruction>(Op)->getParent() != &this->BB;
106       })) {
107     LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
108     return false;
109   }
110 
111   if (any_of(Operands,
112              [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
113     LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
114     return false;
115   }
116 
117   // For loads, check that there are no instructions writing to memory in
118   // between them.
119   // TODO: we only have to forbid instructions writing to memory that could
120   //       interfere with any of the loads in the bundle
121   if (Opcode == Instruction::Load) {
122     unsigned LoadsSeen = 0;
123     VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
124     for (auto &I : *Parent) {
125       auto *VPI = cast<VPInstruction>(&I);
126       if (VPI->getOpcode() == Instruction::Load &&
127           llvm::is_contained(Operands, VPI))
128         LoadsSeen++;
129 
130       if (LoadsSeen == Operands.size())
131         break;
132       if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
133         LLVM_DEBUG(
134             dbgs() << "VPSLP: instruction modifying memory between loads\n");
135         return false;
136       }
137     }
138 
139     if (!all_of(Operands, [](VPValue *Op) {
140           return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
141               ->isSimple();
142         })) {
143       LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
144       return false;
145     }
146   }
147 
148   if (Opcode == Instruction::Store)
149     if (!all_of(Operands, [](VPValue *Op) {
150           return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
151               ->isSimple();
152         })) {
153       LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
154       return false;
155     }
156 
157   return true;
158 }
159 
160 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
161                                              unsigned OperandIndex) {
162   SmallVector<VPValue *, 4> Operands;
163   for (VPValue *V : Values) {
164     // Currently we only support VPInstructions.
165     auto *U = cast<VPInstruction>(V);
166     Operands.push_back(U->getOperand(OperandIndex));
167   }
168   return Operands;
169 }
170 
171 static bool areCommutative(ArrayRef<VPValue *> Values) {
172   return Instruction::isCommutative(
173       cast<VPInstruction>(Values[0])->getOpcode());
174 }
175 
176 static SmallVector<SmallVector<VPValue *, 4>, 4>
177 getOperands(ArrayRef<VPValue *> Values) {
178   SmallVector<SmallVector<VPValue *, 4>, 4> Result;
179   auto *VPI = cast<VPInstruction>(Values[0]);
180 
181   switch (VPI->getOpcode()) {
182   case Instruction::Load:
183     llvm_unreachable("Loads terminate a tree, no need to get operands");
184   case Instruction::Store:
185     Result.push_back(getOperands(Values, 0));
186     break;
187   default:
188     for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
189       Result.push_back(getOperands(Values, I));
190     break;
191   }
192 
193   return Result;
194 }
195 
196 /// Returns the opcode of Values or ~0 if they do not all agree.
197 static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
198   unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
199   if (any_of(Values, [Opcode](VPValue *V) {
200         return cast<VPInstruction>(V)->getOpcode() != Opcode;
201       }))
202     return None;
203   return {Opcode};
204 }
205 
206 /// Returns true if A and B access sequential memory if they are loads or
207 /// stores or if they have identical opcodes otherwise.
208 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
209                                   VPInterleavedAccessInfo &IAI) {
210   if (A->getOpcode() != B->getOpcode())
211     return false;
212 
213   if (A->getOpcode() != Instruction::Load &&
214       A->getOpcode() != Instruction::Store)
215     return true;
216   auto *GA = IAI.getInterleaveGroup(A);
217   auto *GB = IAI.getInterleaveGroup(B);
218 
219   return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
220 }
221 
222 /// Implements getLAScore from Listing 7 in the paper.
223 /// Traverses and compares operands of V1 and V2 to MaxLevel.
224 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
225                            VPInterleavedAccessInfo &IAI) {
226   auto *I1 = dyn_cast<VPInstruction>(V1);
227   auto *I2 = dyn_cast<VPInstruction>(V2);
228   // Currently we only support VPInstructions.
229   if (!I1 || !I2)
230     return 0;
231 
232   if (MaxLevel == 0)
233     return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
234 
235   unsigned Score = 0;
236   for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
237     for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
238       Score +=
239           getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
240   return Score;
241 }
242 
243 std::pair<VPlanSlp::OpMode, VPValue *>
244 VPlanSlp::getBest(OpMode Mode, VPValue *Last,
245                   SmallPtrSetImpl<VPValue *> &Candidates,
246                   VPInterleavedAccessInfo &IAI) {
247   assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
248          "Currently we only handle load and commutative opcodes");
249   LLVM_DEBUG(dbgs() << "      getBest\n");
250 
251   SmallVector<VPValue *, 4> BestCandidates;
252   LLVM_DEBUG(dbgs() << "        Candidates  for "
253                     << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
254   for (auto *Candidate : Candidates) {
255     auto *LastI = cast<VPInstruction>(Last);
256     auto *CandidateI = cast<VPInstruction>(Candidate);
257     if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
258       LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
259                         << " ");
260       BestCandidates.push_back(Candidate);
261     }
262   }
263   LLVM_DEBUG(dbgs() << "\n");
264 
265   if (BestCandidates.empty())
266     return {OpMode::Failed, nullptr};
267 
268   if (BestCandidates.size() == 1)
269     return {Mode, BestCandidates[0]};
270 
271   VPValue *Best = nullptr;
272   unsigned BestScore = 0;
273   for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
274     unsigned PrevScore = ~0u;
275     bool AllSame = true;
276 
277     // FIXME: Avoid visiting the same operands multiple times.
278     for (auto *Candidate : BestCandidates) {
279       unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
280       if (PrevScore == ~0u)
281         PrevScore = Score;
282       if (PrevScore != Score)
283         AllSame = false;
284       PrevScore = Score;
285 
286       if (Score > BestScore) {
287         BestScore = Score;
288         Best = Candidate;
289       }
290     }
291     if (!AllSame)
292       break;
293   }
294   LLVM_DEBUG(dbgs() << "Found best "
295                     << *cast<VPInstruction>(Best)->getUnderlyingInstr()
296                     << "\n");
297   Candidates.erase(Best);
298 
299   return {Mode, Best};
300 }
301 
302 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
303   SmallVector<MultiNodeOpTy, 4> FinalOrder;
304   SmallVector<OpMode, 4> Mode;
305   FinalOrder.reserve(MultiNodeOps.size());
306   Mode.reserve(MultiNodeOps.size());
307 
308   LLVM_DEBUG(dbgs() << "Reordering multinode\n");
309 
310   for (auto &Operands : MultiNodeOps) {
311     FinalOrder.push_back({Operands.first, {Operands.second[0]}});
312     if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
313         Instruction::Load)
314       Mode.push_back(OpMode::Load);
315     else
316       Mode.push_back(OpMode::Opcode);
317   }
318 
319   for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
320     LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
321     SmallPtrSet<VPValue *, 4> Candidates;
322     LLVM_DEBUG(dbgs() << "  Candidates  ");
323     for (auto Ops : MultiNodeOps) {
324       LLVM_DEBUG(
325           dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
326                  << " ");
327       Candidates.insert(Ops.second[Lane]);
328     }
329     LLVM_DEBUG(dbgs() << "\n");
330 
331     for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
332       LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
333       if (Mode[Op] == OpMode::Failed)
334         continue;
335 
336       VPValue *Last = FinalOrder[Op].second[Lane - 1];
337       std::pair<OpMode, VPValue *> Res =
338           getBest(Mode[Op], Last, Candidates, IAI);
339       if (Res.second)
340         FinalOrder[Op].second.push_back(Res.second);
341       else
342         // TODO: handle this case
343         FinalOrder[Op].second.push_back(markFailed());
344     }
345   }
346 
347   return FinalOrder;
348 }
349 
350 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
351   dbgs() << " Ops: ";
352   for (auto Op : Values) {
353     if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
354       if (auto *Instr = VPInstr->getUnderlyingInstr()) {
355         dbgs() << *Instr << " | ";
356         continue;
357       }
358     dbgs() << " nullptr | ";
359   }
360   dbgs() << "\n";
361 }
362 
363 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
364   assert(!Values.empty() && "Need some operands!");
365 
366   // If we already visited this instruction bundle, re-use the existing node
367   auto I = BundleToCombined.find(to_vector<4>(Values));
368   if (I != BundleToCombined.end()) {
369 #ifndef NDEBUG
370     // Check that the resulting graph is a tree. If we re-use a node, this means
371     // its values have multiple users. We only allow this, if all users of each
372     // value are the same instruction.
373     for (auto *V : Values) {
374       auto UI = V->user_begin();
375       auto *FirstUser = *UI++;
376       while (UI != V->user_end()) {
377         assert(*UI == FirstUser && "Currently we only support SLP trees.");
378         UI++;
379       }
380     }
381 #endif
382     return I->second;
383   }
384 
385   // Dump inputs
386   LLVM_DEBUG({
387     dbgs() << "buildGraph: ";
388     dumpBundle(Values);
389   });
390 
391   if (!areVectorizable(Values))
392     return markFailed();
393 
394   assert(getOpcode(Values) && "Opcodes for all values must match");
395   unsigned ValuesOpcode = getOpcode(Values).getValue();
396 
397   SmallVector<VPValue *, 4> CombinedOperands;
398   if (areCommutative(Values)) {
399     bool MultiNodeRoot = !MultiNodeActive;
400     MultiNodeActive = true;
401     for (auto &Operands : getOperands(Values)) {
402       LLVM_DEBUG({
403         dbgs() << "  Visiting Commutative";
404         dumpBundle(Operands);
405       });
406 
407       auto OperandsOpcode = getOpcode(Operands);
408       if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
409         LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
410         CombinedOperands.push_back(buildGraph(Operands));
411       } else {
412         LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
413         // Create dummy VPInstruction, which will we replace later by the
414         // re-ordered operand.
415         VPInstruction *Op = new VPInstruction(0, {});
416         CombinedOperands.push_back(Op);
417         MultiNodeOps.emplace_back(Op, Operands);
418       }
419     }
420 
421     if (MultiNodeRoot) {
422       LLVM_DEBUG(dbgs() << "Reorder \n");
423       MultiNodeActive = false;
424 
425       auto FinalOrder = reorderMultiNodeOps();
426 
427       MultiNodeOps.clear();
428       for (auto &Ops : FinalOrder) {
429         VPInstruction *NewOp = buildGraph(Ops.second);
430         Ops.first->replaceAllUsesWith(NewOp);
431         for (unsigned i = 0; i < CombinedOperands.size(); i++)
432           if (CombinedOperands[i] == Ops.first)
433             CombinedOperands[i] = NewOp;
434         delete Ops.first;
435         Ops.first = NewOp;
436       }
437       LLVM_DEBUG(dbgs() << "Found final order\n");
438     }
439   } else {
440     LLVM_DEBUG(dbgs() << "  NonCommuntative\n");
441     if (ValuesOpcode == Instruction::Load)
442       for (VPValue *V : Values)
443         CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
444     else
445       for (auto &Operands : getOperands(Values))
446         CombinedOperands.push_back(buildGraph(Operands));
447   }
448 
449   unsigned Opcode;
450   switch (ValuesOpcode) {
451   case Instruction::Load:
452     Opcode = VPInstruction::SLPLoad;
453     break;
454   case Instruction::Store:
455     Opcode = VPInstruction::SLPStore;
456     break;
457   default:
458     Opcode = ValuesOpcode;
459     break;
460   }
461 
462   if (!CompletelySLP)
463     return markFailed();
464 
465   assert(CombinedOperands.size() > 0 && "Need more some operands");
466   auto *VPI = new VPInstruction(Opcode, CombinedOperands);
467   VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
468 
469   LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
470                     << *cast<VPInstruction>(Values[0]) << "\n");
471   addCombined(Values, VPI);
472   return VPI;
473 }
474