//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements SLP analysis based on VPlan. The analysis is based on
/// the ideas described in
///
///   Look-ahead SLP: auto-vectorization in the presence of commutative
///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
///   Luís F. W. Góes
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <cassert>
#include <iterator>
#include <string>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "vplan-slp"

// Number of levels to look ahead when re-ordering multi node operands.
static unsigned LookaheadMaxDepth = 5;

VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  //        trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}

void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  if (all_of(Operands, [](VPValue *V) {
        return cast<VPInstruction>(V)->getUnderlyingInstr();
      })) {
    unsigned BundleSize = 0;
    for (VPValue *V : Operands) {
      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
      assert(!T->isVectorTy() && "Only scalar types supported for now");
      BundleSize += T->getScalarSizeInBits();
    }
    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
  }

  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Res.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Res;
}

bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  //        instructions.
  // FIXME: Deal with non-primitive types.
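  // Use the first operand as the reference for the bundle: every other
  // operand must match its opcode and type width.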
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree\n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  //       interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &I : *Parent) {
      auto *VPI = cast<VPInstruction>(&I);
      if (VPI->getOpcode() == Instruction::Load &&
          std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
        LoadsSeen++;

      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}

static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Operands;
  for (VPValue *V : Values) {
    auto *U = cast<VPUser>(V);
    Operands.push_back(U->getOperand(OperandIndex));
  }
  return Operands;
}

static bool areCommutative(ArrayRef<VPValue *> Values) {
  return Instruction::isCommutative(
      cast<VPInstruction>(Values[0])->getOpcode());
}

static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  auto *VPI = cast<VPInstruction>(Values[0]);

  switch (VPI->getOpcode()) {
  case Instruction::Load:
    llvm_unreachable("Loads terminate a tree, no need to get operands");
  case Instruction::Store:
    Result.push_back(getOperands(Values, 0));
    break;
  default:
    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
      Result.push_back(getOperands(Values, I));
    break;
  }

  return Result;
}

/// Returns the opcode of Values or None if they do not all agree.
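/// For example, a bundle of adds yields Instruction::Add, while a bundle
/// mixing add and mul yields None.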
static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  if (any_of(Values, [Opcode](VPValue *V) {
        return cast<VPInstruction>(V)->getOpcode() != Opcode;
      }))
    return None;
  return {Opcode};
}

/// Returns true if A and B have matching opcodes and, if they are loads or
/// stores, also access consecutive memory.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  if (A->getOpcode() != B->getOpcode())
    return false;

  if (A->getOpcode() != Instruction::Load &&
      A->getOpcode() != Instruction::Store)
    return true;
  auto *GA = IAI.getInterleaveGroup(A);
  auto *GB = IAI.getInterleaveGroup(B);

  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
}

/// Implements getLAScore from Listing 7 in the paper.
/// Traverses and compares operands of V1 and V2 to MaxLevel.
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
                           VPInterleavedAccessInfo &IAI) {
  if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
    return 0;

  if (MaxLevel == 0)
    return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
                                           cast<VPInstruction>(V2), IAI);

  unsigned Score = 0;
  for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
    for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
      Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
                          cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
  return Score;
}

std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << " getBest\n");

  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << " Candidates for "
                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
  for (auto *Candidate : Candidates) {
    auto *LastI = cast<VPInstruction>(Last);
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
                        << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  VPValue *Best = nullptr;
  unsigned BestScore = 0;
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
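    // Score each candidate against Last by recursively comparing their
    // operands up to the current look-ahead depth; keep the highest scorer.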
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    if (!AllSame)
      break;
  }
  LLVM_DEBUG(dbgs() << "Found best "
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                    << "\n");
  Candidates.erase(Best);

  return {Mode, Best};
}

SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << " Candidates ");
    for (auto Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
      LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second)
        FinalOrder[Op].second.push_back(Res.second);
      else
        // TODO: handle this case
        FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}

void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  dbgs() << " Ops: ";
  for (auto *Op : Values) {
    // Guard against null bundle entries (e.g. results of markFailed()) before
    // looking up the underlying instruction.
    auto *VPInstr = cast_or_null<VPInstruction>(Op);
    if (auto *Instr = VPInstr ? VPInstr->getUnderlyingInstr() : nullptr)
      dbgs() << *Instr << " | ";
    else
      dbgs() << " nullptr | ";
  }
  dbgs() << "\n";
}

VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node.
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this
    // means its values have multiple users. We only allow this if all users
    // of each value are the same instruction.
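    // Walk each value's users and check that they are all the same
    // instruction.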
    for (auto *V : Values) {
      auto UI = V->user_begin();
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = getOpcode(Values).getValue();

  SmallVector<VPValue *, 4> CombinedOperands;
  if (areCommutative(Values)) {
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << " Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
        // Create a dummy VPInstruction, which we will replace later with the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder\n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << " NonCommutative\n");
    if (ValuesOpcode == Instruction::Load)
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need some operands");
  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());

  LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
             cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
  addCombined(Values, VPI);
  return VPI;
}