xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// This file implements SLP analysis based on VPlan. The analysis is based on
9 /// the ideas described in
10 ///
11 ///   Look-ahead SLP: auto-vectorization in the presence of commutative
12 ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
13 ///   Luís F. W. Góes
14 ///
15 //===----------------------------------------------------------------------===//
16 
17 #include "VPlanSLP.h"
18 #include "VPlan.h"
19 #include "VPlanCFG.h"
20 #include "VPlanValue.h"
21 #include "llvm/ADT/DenseMap.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/Analysis/LoopInfo.h"
24 #include "llvm/Analysis/VectorUtils.h"
25 #include "llvm/IR/Instruction.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/IR/Value.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/ErrorHandling.h"
32 #include "llvm/Support/raw_ostream.h"
33 #include <algorithm>
34 #include <cassert>
35 #include <optional>
36 #include <utility>
37 
38 using namespace llvm;
39 
40 #define DEBUG_TYPE "vplan-slp"
41 
// Number of levels to look ahead when re-ordering multi node operands.
// Read-only tuning knob (only consulted in getBest), so make it constexpr.
static constexpr unsigned LookaheadMaxDepth = 5;
44 
45 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
46                                           Old2NewTy &Old2New,
47                                           InterleavedAccessInfo &IAI) {
48   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
49       Region->getEntry());
50   for (VPBlockBase *Base : RPOT) {
51     visitBlock(Base, Old2New, IAI);
52   }
53 }
54 
// For a basic block, mirror every IR-level interleave group that contains an
// instruction modeled by a VPInstruction into a VPInstruction-based group;
// for a region, recurse into it.
void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
                                         InterleavedAccessInfo &IAI) {
  if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
    for (VPRecipeBase &VPI : *VPBB) {
      if (isa<VPWidenPHIRecipe>(&VPI))
        continue;
      // Only VPInstructions with an underlying IR Instruction can be looked
      // up in the IR-level InterleavedAccessInfo.
      auto *VPInst = dyn_cast<VPInstruction>(&VPI);
      if (!VPInst)
        continue;
      auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue());
      if (!Inst)
        continue;
      auto *IG = IAI.getInterleaveGroup(Inst);
      if (!IG)
        continue;

      // Lazily create the VPInstruction clone of this group, copying factor,
      // direction and alignment from the IR-level group.
      auto NewIGIter = Old2New.find(IG);
      if (NewIGIter == Old2New.end())
        Old2New[IG] = new InterleaveGroup<VPInstruction>(
            IG->getFactor(), IG->isReverse(), IG->getAlign());

      // Keep the insert position in sync with the original group.
      if (Inst == IG->getInsertPos())
        Old2New[IG]->setInsertPos(VPInst);

      InterleaveGroupMap[VPInst] = Old2New[IG];
      // NOTE(review): for reverse groups a negated factor is handed to
      // Align, whose constructor expects a positive power of two — confirm
      // this path is intentional/reachable.
      InterleaveGroupMap[VPInst]->insertMember(
          VPInst, IG->getIndex(Inst),
          Align(IG->isReverse() ? (-1) * int(IG->getFactor())
                                : IG->getFactor()));
    }
  } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) {
    visitRegion(Region, Old2New, IAI);
  } else {
    llvm_unreachable("Unsupported kind of VPBlock.");
  }
}
91 
92 VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
93                                                  InterleavedAccessInfo &IAI) {
94   Old2NewTy Old2New;
95   visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
96 }
97 
// Record that the current plan cannot be fully SLP'ized and return nullptr,
// which callers propagate as the failed node.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  //        trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}
104 
105 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
106   if (all_of(Operands, [](VPValue *V) {
107         return cast<VPInstruction>(V)->getUnderlyingInstr();
108       })) {
109     unsigned BundleSize = 0;
110     for (VPValue *V : Operands) {
111       Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
112       assert(!T->isVectorTy() && "Only scalar types supported for now");
113       BundleSize += T->getScalarSizeInBits();
114     }
115     WidestBundleBits = std::max(WidestBundleBits, BundleSize);
116   }
117 
118   auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
119   assert(Res.second &&
120          "Already created a combined instruction for the operand bundle");
121   (void)Res;
122 }
123 
// Returns true if the bundle \p Operands can be combined into a single SLP
// node: all operands are VPInstructions with underlying IR instructions,
// agree on opcode and width, live in this block, have a single unique user,
// and (for memory ops) are simple accesses without interfering writes.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  //        instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  // Values used by more than one distinct instruction would break the tree
  // shape buildGraph asserts on.
  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  //       interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &I : *Parent) {
      auto *VPI = dyn_cast<VPInstruction>(&I);
      if (!VPI)
        // NOTE(review): the scan stops at the first non-VPInstruction
        // recipe; presumably no bundled load can appear after one — confirm.
        break;
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      // Once every bundled load has been seen, nothing later can interfere.
      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}
209 
210 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
211                                              unsigned OperandIndex) {
212   SmallVector<VPValue *, 4> Operands;
213   for (VPValue *V : Values) {
214     // Currently we only support VPInstructions.
215     auto *U = cast<VPInstruction>(V);
216     Operands.push_back(U->getOperand(OperandIndex));
217   }
218   return Operands;
219 }
220 
221 static bool areCommutative(ArrayRef<VPValue *> Values) {
222   return Instruction::isCommutative(
223       cast<VPInstruction>(Values[0])->getOpcode());
224 }
225 
226 static SmallVector<SmallVector<VPValue *, 4>, 4>
227 getOperands(ArrayRef<VPValue *> Values) {
228   SmallVector<SmallVector<VPValue *, 4>, 4> Result;
229   auto *VPI = cast<VPInstruction>(Values[0]);
230 
231   switch (VPI->getOpcode()) {
232   case Instruction::Load:
233     llvm_unreachable("Loads terminate a tree, no need to get operands");
234   case Instruction::Store:
235     Result.push_back(getOperands(Values, 0));
236     break;
237   default:
238     for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
239       Result.push_back(getOperands(Values, I));
240     break;
241   }
242 
243   return Result;
244 }
245 
246 /// Returns the opcode of Values or ~0 if they do not all agree.
247 static std::optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
248   unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
249   if (any_of(Values, [Opcode](VPValue *V) {
250         return cast<VPInstruction>(V)->getOpcode() != Opcode;
251       }))
252     return std::nullopt;
253   return {Opcode};
254 }
255 
256 /// Returns true if A and B access sequential memory if they are loads or
257 /// stores or if they have identical opcodes otherwise.
258 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
259                                   VPInterleavedAccessInfo &IAI) {
260   if (A->getOpcode() != B->getOpcode())
261     return false;
262 
263   if (A->getOpcode() != Instruction::Load &&
264       A->getOpcode() != Instruction::Store)
265     return true;
266   auto *GA = IAI.getInterleaveGroup(A);
267   auto *GB = IAI.getInterleaveGroup(B);
268 
269   return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
270 }
271 
272 /// Implements getLAScore from Listing 7 in the paper.
273 /// Traverses and compares operands of V1 and V2 to MaxLevel.
274 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
275                            VPInterleavedAccessInfo &IAI) {
276   auto *I1 = dyn_cast<VPInstruction>(V1);
277   auto *I2 = dyn_cast<VPInstruction>(V2);
278   // Currently we only support VPInstructions.
279   if (!I1 || !I2)
280     return 0;
281 
282   if (MaxLevel == 0)
283     return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
284 
285   unsigned Score = 0;
286   for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
287     for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
288       Score +=
289           getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
290   return Score;
291 }
292 
// Pick from \p Candidates the value that best continues \p Last for one lane
// of a multi-node row: filter to matching/consecutive candidates, then break
// ties with the look-ahead score. Returns the (possibly updated) mode and
// the chosen value, or {Failed, nullptr} when nothing matches.
std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << "      getBest\n");

  // Narrow Candidates down to those consecutive to / matching Last.
  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << "        Candidates  for "
                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
  for (auto *Candidate : Candidates) {
    auto *LastI = cast<VPInstruction>(Last);
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
                        << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  // NOTE(review): with exactly one match the candidate is returned without
  // being erased from Candidates, unlike the multi-candidate path below —
  // confirm the asymmetry is intended.
  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  // Break ties by look-ahead score, deepening until the scores of the
  // candidates diverge or LookaheadMaxDepth is reached.
  VPValue *Best = nullptr;
  unsigned BestScore = 0;
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    if (!AllSame)
      break;
  }
  // NOTE(review): if every score stays 0, Best remains nullptr and is
  // dereferenced in the debug dump below — confirm this is unreachable here.
  LLVM_DEBUG(dbgs() << "Found best "
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                    << "\n");
  // Claim the winner so other rows cannot reuse it for this lane.
  Candidates.erase(Best);

  return {Mode, Best};
}
351 
// Re-order the operand rows collected in MultiNodeOps lane by lane: each row
// keeps its lane-0 value, and for every further lane the best-matching value
// among all rows' candidates is chosen greedily (see getBest). Failed picks
// are recorded as nullptr via markFailed.
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  // Seed each row with its lane-0 value and record whether the row is
  // matched as loads or by opcode.
  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
    // Pool of this lane's values across all rows; getBest erases the values
    // it assigns, so rows compete for them.
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << "  Candidates  ");
    for (auto Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      // Extend this row with the candidate that best matches its value in
      // the previous lane.
      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second)
        FinalOrder[Op].second.push_back(Res.second);
      else
        // TODO: handle this case
        FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}
399 
400 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
401 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
402   dbgs() << " Ops: ";
403   for (auto *Op : Values) {
404     if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
405       if (auto *Instr = VPInstr->getUnderlyingInstr()) {
406         dbgs() << *Instr << " | ";
407         continue;
408       }
409     dbgs() << " nullptr | ";
410   }
411   dbgs() << "\n";
412 }
413 #endif
414 
// Recursively build the SLP graph for the bundle \p Values and return the
// combined VPInstruction for it, or nullptr (via markFailed) when the bundle
// cannot be SLP'ized. Commutative bundles are gathered into a multi node
// whose operand rows are re-ordered before recursing.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this means
    // its values have multiple users. We only allow this, if all users of each
    // value are the same instruction.
    for (auto *V : Values) {
      auto UI = V->user_begin();
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = *getOpcode(Values);

  SmallVector<VPValue *, 4> CombinedOperands;
  if (areCommutative(Values)) {
    // Commutative bundles form a multi node: operand bundles with the same
    // opcode keep being combined recursively, while differing bundles are
    // collected (behind dummy placeholders) and re-ordered once the
    // outermost multi node is complete.
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << "  Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
        // Create dummy VPInstruction, which will we replace later by the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      // Build graphs for the re-ordered bundles, replace each dummy
      // placeholder with the resulting node, and free the dummy.
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << "  NonCommuntative\n");
    // Loads terminate the tree: operand 0 of each load (presumably the
    // address — confirm) is used directly. Other opcodes recurse per
    // operand bundle.
    if (ValuesOpcode == Instruction::Load)
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

  // Map memory opcodes to the dedicated SLP opcodes; everything else keeps
  // its IR opcode.
  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  // A recursive call may have marked the plan as failed; bail out instead of
  // creating a node on top of a failed subtree.
  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need more some operands");
  auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
  auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << Values[0]
                    << "\n");
  addCombined(Values, VPI);
  return VPI;
}
526