xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp (revision ec4deee4e4f2aef1b97d9424f25d04e91fd7dc10)
1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// This file implements SLP analysis based on VPlan. The analysis is based on
9 /// the ideas described in
10 ///
11 ///   Look-ahead SLP: auto-vectorization in the presence of commutative
12 ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
13 ///   Luís F. W. Góes
14 ///
15 //===----------------------------------------------------------------------===//
16 
17 #include "VPlan.h"
18 #include "llvm/ADT/DepthFirstIterator.h"
19 #include "llvm/ADT/PostOrderIterator.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/VectorUtils.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/CFG.h"
26 #include "llvm/IR/Dominators.h"
27 #include "llvm/IR/InstrTypes.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Type.h"
31 #include "llvm/IR/Value.h"
32 #include "llvm/Support/Casting.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/ErrorHandling.h"
35 #include "llvm/Support/GraphWriter.h"
36 #include "llvm/Support/raw_ostream.h"
37 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
38 #include <cassert>
39 #include <iterator>
40 #include <string>
41 #include <vector>
42 
43 using namespace llvm;
44 
45 #define DEBUG_TYPE "vplan-slp"
46 
47 // Number of levels to look ahead when re-ordering multi node operands.
48 static unsigned LookaheadMaxDepth = 5;
49 
50 VPInstruction *VPlanSlp::markFailed() {
51   // FIXME: Currently this is used to signal we hit instructions we cannot
52   //        trivially SLP'ize.
53   CompletelySLP = false;
54   return nullptr;
55 }
56 
57 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
58   if (all_of(Operands, [](VPValue *V) {
59         return cast<VPInstruction>(V)->getUnderlyingInstr();
60       })) {
61     unsigned BundleSize = 0;
62     for (VPValue *V : Operands) {
63       Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
64       assert(!T->isVectorTy() && "Only scalar types supported for now");
65       BundleSize += T->getScalarSizeInBits();
66     }
67     WidestBundleBits = std::max(WidestBundleBits, BundleSize);
68   }
69 
70   auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
71   assert(Res.second &&
72          "Already created a combined instruction for the operand bundle");
73   (void)Res;
74 }
75 
76 bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
77   // Currently we only support VPInstructions.
78   if (!all_of(Operands, [](VPValue *Op) {
79         return Op && isa<VPInstruction>(Op) &&
80                cast<VPInstruction>(Op)->getUnderlyingInstr();
81       })) {
82     LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
83     return false;
84   }
85 
86   // Check if opcodes and type width agree for all instructions in the bundle.
87   // FIXME: Differing widths/opcodes can be handled by inserting additional
88   //        instructions.
89   // FIXME: Deal with non-primitive types.
90   const Instruction *OriginalInstr =
91       cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
92   unsigned Opcode = OriginalInstr->getOpcode();
93   unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
94   if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
95         const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
96         return I->getOpcode() == Opcode &&
97                I->getType()->getPrimitiveSizeInBits() == Width;
98       })) {
99     LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
100     return false;
101   }
102 
103   // For now, all operands must be defined in the same BB.
104   if (any_of(Operands, [this](VPValue *Op) {
105         return cast<VPInstruction>(Op)->getParent() != &this->BB;
106       })) {
107     LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
108     return false;
109   }
110 
111   if (any_of(Operands,
112              [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
113     LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
114     return false;
115   }
116 
117   // For loads, check that there are no instructions writing to memory in
118   // between them.
119   // TODO: we only have to forbid instructions writing to memory that could
120   //       interfere with any of the loads in the bundle
121   if (Opcode == Instruction::Load) {
122     unsigned LoadsSeen = 0;
123     VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
124     for (auto &I : *Parent) {
125       auto *VPI = cast<VPInstruction>(&I);
126       if (VPI->getOpcode() == Instruction::Load &&
127           std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
128         LoadsSeen++;
129 
130       if (LoadsSeen == Operands.size())
131         break;
132       if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
133         LLVM_DEBUG(
134             dbgs() << "VPSLP: instruction modifying memory between loads\n");
135         return false;
136       }
137     }
138 
139     if (!all_of(Operands, [](VPValue *Op) {
140           return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
141               ->isSimple();
142         })) {
143       LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
144       return false;
145     }
146   }
147 
148   if (Opcode == Instruction::Store)
149     if (!all_of(Operands, [](VPValue *Op) {
150           return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
151               ->isSimple();
152         })) {
153       LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
154       return false;
155     }
156 
157   return true;
158 }
159 
160 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
161                                              unsigned OperandIndex) {
162   SmallVector<VPValue *, 4> Operands;
163   for (VPValue *V : Values) {
164     auto *U = cast<VPUser>(V);
165     Operands.push_back(U->getOperand(OperandIndex));
166   }
167   return Operands;
168 }
169 
170 static bool areCommutative(ArrayRef<VPValue *> Values) {
171   return Instruction::isCommutative(
172       cast<VPInstruction>(Values[0])->getOpcode());
173 }
174 
175 static SmallVector<SmallVector<VPValue *, 4>, 4>
176 getOperands(ArrayRef<VPValue *> Values) {
177   SmallVector<SmallVector<VPValue *, 4>, 4> Result;
178   auto *VPI = cast<VPInstruction>(Values[0]);
179 
180   switch (VPI->getOpcode()) {
181   case Instruction::Load:
182     llvm_unreachable("Loads terminate a tree, no need to get operands");
183   case Instruction::Store:
184     Result.push_back(getOperands(Values, 0));
185     break;
186   default:
187     for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
188       Result.push_back(getOperands(Values, I));
189     break;
190   }
191 
192   return Result;
193 }
194 
195 /// Returns the opcode of Values or ~0 if they do not all agree.
196 static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
197   unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
198   if (any_of(Values, [Opcode](VPValue *V) {
199         return cast<VPInstruction>(V)->getOpcode() != Opcode;
200       }))
201     return None;
202   return {Opcode};
203 }
204 
205 /// Returns true if A and B access sequential memory if they are loads or
206 /// stores or if they have identical opcodes otherwise.
207 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
208                                   VPInterleavedAccessInfo &IAI) {
209   if (A->getOpcode() != B->getOpcode())
210     return false;
211 
212   if (A->getOpcode() != Instruction::Load &&
213       A->getOpcode() != Instruction::Store)
214     return true;
215   auto *GA = IAI.getInterleaveGroup(A);
216   auto *GB = IAI.getInterleaveGroup(B);
217 
218   return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
219 }
220 
221 /// Implements getLAScore from Listing 7 in the paper.
222 /// Traverses and compares operands of V1 and V2 to MaxLevel.
223 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
224                            VPInterleavedAccessInfo &IAI) {
225   if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
226     return 0;
227 
228   if (MaxLevel == 0)
229     return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
230                                            cast<VPInstruction>(V2), IAI);
231 
232   unsigned Score = 0;
233   for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
234     for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
235       Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
236                           cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
237   return Score;
238 }
239 
240 std::pair<VPlanSlp::OpMode, VPValue *>
241 VPlanSlp::getBest(OpMode Mode, VPValue *Last,
242                   SmallPtrSetImpl<VPValue *> &Candidates,
243                   VPInterleavedAccessInfo &IAI) {
244   assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
245          "Currently we only handle load and commutative opcodes");
246   LLVM_DEBUG(dbgs() << "      getBest\n");
247 
248   SmallVector<VPValue *, 4> BestCandidates;
249   LLVM_DEBUG(dbgs() << "        Candidates  for "
250                     << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
251   for (auto *Candidate : Candidates) {
252     auto *LastI = cast<VPInstruction>(Last);
253     auto *CandidateI = cast<VPInstruction>(Candidate);
254     if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
255       LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
256                         << " ");
257       BestCandidates.push_back(Candidate);
258     }
259   }
260   LLVM_DEBUG(dbgs() << "\n");
261 
262   if (BestCandidates.empty())
263     return {OpMode::Failed, nullptr};
264 
265   if (BestCandidates.size() == 1)
266     return {Mode, BestCandidates[0]};
267 
268   VPValue *Best = nullptr;
269   unsigned BestScore = 0;
270   for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
271     unsigned PrevScore = ~0u;
272     bool AllSame = true;
273 
274     // FIXME: Avoid visiting the same operands multiple times.
275     for (auto *Candidate : BestCandidates) {
276       unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
277       if (PrevScore == ~0u)
278         PrevScore = Score;
279       if (PrevScore != Score)
280         AllSame = false;
281       PrevScore = Score;
282 
283       if (Score > BestScore) {
284         BestScore = Score;
285         Best = Candidate;
286       }
287     }
288     if (!AllSame)
289       break;
290   }
291   LLVM_DEBUG(dbgs() << "Found best "
292                     << *cast<VPInstruction>(Best)->getUnderlyingInstr()
293                     << "\n");
294   Candidates.erase(Best);
295 
296   return {Mode, Best};
297 }
298 
299 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
300   SmallVector<MultiNodeOpTy, 4> FinalOrder;
301   SmallVector<OpMode, 4> Mode;
302   FinalOrder.reserve(MultiNodeOps.size());
303   Mode.reserve(MultiNodeOps.size());
304 
305   LLVM_DEBUG(dbgs() << "Reordering multinode\n");
306 
307   for (auto &Operands : MultiNodeOps) {
308     FinalOrder.push_back({Operands.first, {Operands.second[0]}});
309     if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
310         Instruction::Load)
311       Mode.push_back(OpMode::Load);
312     else
313       Mode.push_back(OpMode::Opcode);
314   }
315 
316   for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
317     LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
318     SmallPtrSet<VPValue *, 4> Candidates;
319     LLVM_DEBUG(dbgs() << "  Candidates  ");
320     for (auto Ops : MultiNodeOps) {
321       LLVM_DEBUG(
322           dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
323                  << " ");
324       Candidates.insert(Ops.second[Lane]);
325     }
326     LLVM_DEBUG(dbgs() << "\n");
327 
328     for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
329       LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
330       if (Mode[Op] == OpMode::Failed)
331         continue;
332 
333       VPValue *Last = FinalOrder[Op].second[Lane - 1];
334       std::pair<OpMode, VPValue *> Res =
335           getBest(Mode[Op], Last, Candidates, IAI);
336       if (Res.second)
337         FinalOrder[Op].second.push_back(Res.second);
338       else
339         // TODO: handle this case
340         FinalOrder[Op].second.push_back(markFailed());
341     }
342   }
343 
344   return FinalOrder;
345 }
346 
347 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
348   dbgs() << " Ops: ";
349   for (auto Op : Values) {
350     if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
351       if (auto *Instr = VPInstr->getUnderlyingInstr()) {
352         dbgs() << *Instr << " | ";
353         continue;
354       }
355     dbgs() << " nullptr | ";
356   }
357   dbgs() << "\n";
358 }
359 
360 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
361   assert(!Values.empty() && "Need some operands!");
362 
363   // If we already visited this instruction bundle, re-use the existing node
364   auto I = BundleToCombined.find(to_vector<4>(Values));
365   if (I != BundleToCombined.end()) {
366 #ifndef NDEBUG
367     // Check that the resulting graph is a tree. If we re-use a node, this means
368     // its values have multiple users. We only allow this, if all users of each
369     // value are the same instruction.
370     for (auto *V : Values) {
371       auto UI = V->user_begin();
372       auto *FirstUser = *UI++;
373       while (UI != V->user_end()) {
374         assert(*UI == FirstUser && "Currently we only support SLP trees.");
375         UI++;
376       }
377     }
378 #endif
379     return I->second;
380   }
381 
382   // Dump inputs
383   LLVM_DEBUG({
384     dbgs() << "buildGraph: ";
385     dumpBundle(Values);
386   });
387 
388   if (!areVectorizable(Values))
389     return markFailed();
390 
391   assert(getOpcode(Values) && "Opcodes for all values must match");
392   unsigned ValuesOpcode = getOpcode(Values).getValue();
393 
394   SmallVector<VPValue *, 4> CombinedOperands;
395   if (areCommutative(Values)) {
396     bool MultiNodeRoot = !MultiNodeActive;
397     MultiNodeActive = true;
398     for (auto &Operands : getOperands(Values)) {
399       LLVM_DEBUG({
400         dbgs() << "  Visiting Commutative";
401         dumpBundle(Operands);
402       });
403 
404       auto OperandsOpcode = getOpcode(Operands);
405       if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
406         LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
407         CombinedOperands.push_back(buildGraph(Operands));
408       } else {
409         LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
410         // Create dummy VPInstruction, which will we replace later by the
411         // re-ordered operand.
412         VPInstruction *Op = new VPInstruction(0, {});
413         CombinedOperands.push_back(Op);
414         MultiNodeOps.emplace_back(Op, Operands);
415       }
416     }
417 
418     if (MultiNodeRoot) {
419       LLVM_DEBUG(dbgs() << "Reorder \n");
420       MultiNodeActive = false;
421 
422       auto FinalOrder = reorderMultiNodeOps();
423 
424       MultiNodeOps.clear();
425       for (auto &Ops : FinalOrder) {
426         VPInstruction *NewOp = buildGraph(Ops.second);
427         Ops.first->replaceAllUsesWith(NewOp);
428         for (unsigned i = 0; i < CombinedOperands.size(); i++)
429           if (CombinedOperands[i] == Ops.first)
430             CombinedOperands[i] = NewOp;
431         delete Ops.first;
432         Ops.first = NewOp;
433       }
434       LLVM_DEBUG(dbgs() << "Found final order\n");
435     }
436   } else {
437     LLVM_DEBUG(dbgs() << "  NonCommuntative\n");
438     if (ValuesOpcode == Instruction::Load)
439       for (VPValue *V : Values)
440         CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
441     else
442       for (auto &Operands : getOperands(Values))
443         CombinedOperands.push_back(buildGraph(Operands));
444   }
445 
446   unsigned Opcode;
447   switch (ValuesOpcode) {
448   case Instruction::Load:
449     Opcode = VPInstruction::SLPLoad;
450     break;
451   case Instruction::Store:
452     Opcode = VPInstruction::SLPStore;
453     break;
454   default:
455     Opcode = ValuesOpcode;
456     break;
457   }
458 
459   if (!CompletelySLP)
460     return markFailed();
461 
462   assert(CombinedOperands.size() > 0 && "Need more some operands");
463   auto *VPI = new VPInstruction(Opcode, CombinedOperands);
464   VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
465 
466   LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
467              cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
468   addCombined(Values, VPI);
469   return VPI;
470 }
471