//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// This file implements SLP analysis based on VPlan. The analysis is based on /// the ideas described in /// /// Look-ahead SLP: auto-vectorization in the presence of commutative /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, /// Luís F. W. Góes /// //===----------------------------------------------------------------------===// #include "VPlan.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include #include #include #include using namespace llvm; #define DEBUG_TYPE "vplan-slp" // Number of levels to look ahead when re-ordering multi node operands. static unsigned LookaheadMaxDepth = 5; VPInstruction *VPlanSlp::markFailed() { // FIXME: Currently this is used to signal we hit instructions we cannot // trivially SLP'ize. CompletelySLP = false; return nullptr; } void VPlanSlp::addCombined(ArrayRef Operands, VPInstruction *New) { if (all_of(Operands, [](VPValue *V) { return cast(V)->getUnderlyingInstr(); })) { unsigned BundleSize = 0; for (VPValue *V : Operands) { Type *T = cast(V)->getUnderlyingInstr()->getType(); assert(!T->isVectorTy() && "Only scalar types supported for now"); BundleSize += T->getScalarSizeInBits(); } WidestBundleBits = std::max(WidestBundleBits, BundleSize); } auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New); assert(Res.second && "Already created a combined instruction for the operand bundle"); (void)Res; } bool VPlanSlp::areVectorizable(ArrayRef Operands) const { // Currently we only support VPInstructions. if (!all_of(Operands, [](VPValue *Op) { return Op && isa(Op) && cast(Op)->getUnderlyingInstr(); })) { LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n"); return false; } // Check if opcodes and type width agree for all instructions in the bundle. // FIXME: Differing widths/opcodes can be handled by inserting additional // instructions. // FIXME: Deal with non-primitive types. const Instruction *OriginalInstr = cast(Operands[0])->getUnderlyingInstr(); unsigned Opcode = OriginalInstr->getOpcode(); unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits(); if (!all_of(Operands, [Opcode, Width](VPValue *Op) { const Instruction *I = cast(Op)->getUnderlyingInstr(); return I->getOpcode() == Opcode && I->getType()->getPrimitiveSizeInBits() == Width; })) { LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n"); return false; } // For now, all operands must be defined in the same BB. if (any_of(Operands, [this](VPValue *Op) { return cast(Op)->getParent() != &this->BB; })) { LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n"); return false; } if (any_of(Operands, [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) { LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n"); return false; } // For loads, check that there are no instructions writing to memory in // between them. // TODO: we only have to forbid instructions writing to memory that could // interfere with any of the loads in the bundle if (Opcode == Instruction::Load) { unsigned LoadsSeen = 0; VPBasicBlock *Parent = cast(Operands[0])->getParent(); for (auto &I : *Parent) { auto *VPI = dyn_cast(&I); if (!VPI) break; if (VPI->getOpcode() == Instruction::Load && llvm::is_contained(Operands, VPI)) LoadsSeen++; if (LoadsSeen == Operands.size()) break; if (LoadsSeen > 0 && VPI->mayWriteToMemory()) { LLVM_DEBUG( dbgs() << "VPSLP: instruction modifying memory between loads\n"); return false; } } if (!all_of(Operands, [](VPValue *Op) { return cast(cast(Op)->getUnderlyingInstr()) ->isSimple(); })) { LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n"); return false; } } if (Opcode == Instruction::Store) if (!all_of(Operands, [](VPValue *Op) { return cast(cast(Op)->getUnderlyingInstr()) ->isSimple(); })) { LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n"); return false; } return true; } static SmallVector getOperands(ArrayRef Values, unsigned OperandIndex) { SmallVector Operands; for (VPValue *V : Values) { // Currently we only support VPInstructions. auto *U = cast(V); Operands.push_back(U->getOperand(OperandIndex)); } return Operands; } static bool areCommutative(ArrayRef Values) { return Instruction::isCommutative( cast(Values[0])->getOpcode()); } static SmallVector, 4> getOperands(ArrayRef Values) { SmallVector, 4> Result; auto *VPI = cast(Values[0]); switch (VPI->getOpcode()) { case Instruction::Load: llvm_unreachable("Loads terminate a tree, no need to get operands"); case Instruction::Store: Result.push_back(getOperands(Values, 0)); break; default: for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I) Result.push_back(getOperands(Values, I)); break; } return Result; } /// Returns the opcode of Values or ~0 if they do not all agree. static Optional getOpcode(ArrayRef Values) { unsigned Opcode = cast(Values[0])->getOpcode(); if (any_of(Values, [Opcode](VPValue *V) { return cast(V)->getOpcode() != Opcode; })) return None; return {Opcode}; } /// Returns true if A and B access sequential memory if they are loads or /// stores or if they have identical opcodes otherwise. static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, VPInterleavedAccessInfo &IAI) { if (A->getOpcode() != B->getOpcode()) return false; if (A->getOpcode() != Instruction::Load && A->getOpcode() != Instruction::Store) return true; auto *GA = IAI.getInterleaveGroup(A); auto *GB = IAI.getInterleaveGroup(B); return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B); } /// Implements getLAScore from Listing 7 in the paper. /// Traverses and compares operands of V1 and V2 to MaxLevel. static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, VPInterleavedAccessInfo &IAI) { auto *I1 = dyn_cast(V1); auto *I2 = dyn_cast(V2); // Currently we only support VPInstructions. if (!I1 || !I2) return 0; if (MaxLevel == 0) return (unsigned)areConsecutiveOrMatch(I1, I2, IAI); unsigned Score = 0; for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I) for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J) Score += getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI); return Score; } std::pair VPlanSlp::getBest(OpMode Mode, VPValue *Last, SmallPtrSetImpl &Candidates, VPInterleavedAccessInfo &IAI) { assert((Mode == OpMode::Load || Mode == OpMode::Opcode) && "Currently we only handle load and commutative opcodes"); LLVM_DEBUG(dbgs() << " getBest\n"); SmallVector BestCandidates; LLVM_DEBUG(dbgs() << " Candidates for " << *cast(Last)->getUnderlyingInstr() << " "); for (auto *Candidate : Candidates) { auto *LastI = cast(Last); auto *CandidateI = cast(Candidate); if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) { LLVM_DEBUG(dbgs() << *cast(Candidate)->getUnderlyingInstr() << " "); BestCandidates.push_back(Candidate); } } LLVM_DEBUG(dbgs() << "\n"); if (BestCandidates.empty()) return {OpMode::Failed, nullptr}; if (BestCandidates.size() == 1) return {Mode, BestCandidates[0]}; VPValue *Best = nullptr; unsigned BestScore = 0; for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) { unsigned PrevScore = ~0u; bool AllSame = true; // FIXME: Avoid visiting the same operands multiple times. for (auto *Candidate : BestCandidates) { unsigned Score = getLAScore(Last, Candidate, Depth, IAI); if (PrevScore == ~0u) PrevScore = Score; if (PrevScore != Score) AllSame = false; PrevScore = Score; if (Score > BestScore) { BestScore = Score; Best = Candidate; } } if (!AllSame) break; } LLVM_DEBUG(dbgs() << "Found best " << *cast(Best)->getUnderlyingInstr() << "\n"); Candidates.erase(Best); return {Mode, Best}; } SmallVector VPlanSlp::reorderMultiNodeOps() { SmallVector FinalOrder; SmallVector Mode; FinalOrder.reserve(MultiNodeOps.size()); Mode.reserve(MultiNodeOps.size()); LLVM_DEBUG(dbgs() << "Reordering multinode\n"); for (auto &Operands : MultiNodeOps) { FinalOrder.push_back({Operands.first, {Operands.second[0]}}); if (cast(Operands.second[0])->getOpcode() == Instruction::Load) Mode.push_back(OpMode::Load); else Mode.push_back(OpMode::Opcode); } for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) { LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n"); SmallPtrSet Candidates; LLVM_DEBUG(dbgs() << " Candidates "); for (auto Ops : MultiNodeOps) { LLVM_DEBUG( dbgs() << *cast(Ops.second[Lane])->getUnderlyingInstr() << " "); Candidates.insert(Ops.second[Lane]); } LLVM_DEBUG(dbgs() << "\n"); for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) { LLVM_DEBUG(dbgs() << " Checking " << Op << "\n"); if (Mode[Op] == OpMode::Failed) continue; VPValue *Last = FinalOrder[Op].second[Lane - 1]; std::pair Res = getBest(Mode[Op], Last, Candidates, IAI); if (Res.second) FinalOrder[Op].second.push_back(Res.second); else // TODO: handle this case FinalOrder[Op].second.push_back(markFailed()); } } return FinalOrder; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPlanSlp::dumpBundle(ArrayRef Values) { dbgs() << " Ops: "; for (auto Op : Values) { if (auto *VPInstr = cast_or_null(Op)) if (auto *Instr = VPInstr->getUnderlyingInstr()) { dbgs() << *Instr << " | "; continue; } dbgs() << " nullptr | "; } dbgs() << "\n"; } #endif VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { assert(!Values.empty() && "Need some operands!"); // If we already visited this instruction bundle, re-use the existing node auto I = BundleToCombined.find(to_vector<4>(Values)); if (I != BundleToCombined.end()) { #ifndef NDEBUG // Check that the resulting graph is a tree. If we re-use a node, this means // its values have multiple users. We only allow this, if all users of each // value are the same instruction. for (auto *V : Values) { auto UI = V->user_begin(); auto *FirstUser = *UI++; while (UI != V->user_end()) { assert(*UI == FirstUser && "Currently we only support SLP trees."); UI++; } } #endif return I->second; } // Dump inputs LLVM_DEBUG({ dbgs() << "buildGraph: "; dumpBundle(Values); }); if (!areVectorizable(Values)) return markFailed(); assert(getOpcode(Values) && "Opcodes for all values must match"); unsigned ValuesOpcode = getOpcode(Values).getValue(); SmallVector CombinedOperands; if (areCommutative(Values)) { bool MultiNodeRoot = !MultiNodeActive; MultiNodeActive = true; for (auto &Operands : getOperands(Values)) { LLVM_DEBUG({ dbgs() << " Visiting Commutative"; dumpBundle(Operands); }); auto OperandsOpcode = getOpcode(Operands); if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) { LLVM_DEBUG(dbgs() << " Same opcode, continue building\n"); CombinedOperands.push_back(buildGraph(Operands)); } else { LLVM_DEBUG(dbgs() << " Adding multinode Ops\n"); // Create dummy VPInstruction, which will we replace later by the // re-ordered operand. VPInstruction *Op = new VPInstruction(0, {}); CombinedOperands.push_back(Op); MultiNodeOps.emplace_back(Op, Operands); } } if (MultiNodeRoot) { LLVM_DEBUG(dbgs() << "Reorder \n"); MultiNodeActive = false; auto FinalOrder = reorderMultiNodeOps(); MultiNodeOps.clear(); for (auto &Ops : FinalOrder) { VPInstruction *NewOp = buildGraph(Ops.second); Ops.first->replaceAllUsesWith(NewOp); for (unsigned i = 0; i < CombinedOperands.size(); i++) if (CombinedOperands[i] == Ops.first) CombinedOperands[i] = NewOp; delete Ops.first; Ops.first = NewOp; } LLVM_DEBUG(dbgs() << "Found final order\n"); } } else { LLVM_DEBUG(dbgs() << " NonCommuntative\n"); if (ValuesOpcode == Instruction::Load) for (VPValue *V : Values) CombinedOperands.push_back(cast(V)->getOperand(0)); else for (auto &Operands : getOperands(Values)) CombinedOperands.push_back(buildGraph(Operands)); } unsigned Opcode; switch (ValuesOpcode) { case Instruction::Load: Opcode = VPInstruction::SLPLoad; break; case Instruction::Store: Opcode = VPInstruction::SLPStore; break; default: Opcode = ValuesOpcode; break; } if (!CompletelySLP) return markFailed(); assert(CombinedOperands.size() > 0 && "Need more some operands"); auto *Inst = cast(Values[0])->getUnderlyingInstr(); auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); VPI->setUnderlyingInstr(Inst); LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << *cast(Values[0]) << "\n"); addCombined(Values, VPI); return VPI; }