1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// This file implements SLP analysis based on VPlan. The analysis is based on 9 /// the ideas described in 10 /// 11 /// Look-ahead SLP: auto-vectorization in the presence of commutative 12 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, 13 /// Luís F. W. Góes 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #include "VPlan.h" 18 #include "llvm/ADT/DepthFirstIterator.h" 19 #include "llvm/ADT/PostOrderIterator.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Twine.h" 22 #include "llvm/Analysis/LoopInfo.h" 23 #include "llvm/Analysis/VectorUtils.h" 24 #include "llvm/IR/BasicBlock.h" 25 #include "llvm/IR/CFG.h" 26 #include "llvm/IR/Dominators.h" 27 #include "llvm/IR/InstrTypes.h" 28 #include "llvm/IR/Instruction.h" 29 #include "llvm/IR/Instructions.h" 30 #include "llvm/IR/Type.h" 31 #include "llvm/IR/Value.h" 32 #include "llvm/Support/Casting.h" 33 #include "llvm/Support/Debug.h" 34 #include "llvm/Support/ErrorHandling.h" 35 #include "llvm/Support/GraphWriter.h" 36 #include "llvm/Support/raw_ostream.h" 37 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 38 #include <cassert> 39 #include <iterator> 40 #include <string> 41 #include <vector> 42 43 using namespace llvm; 44 45 #define DEBUG_TYPE "vplan-slp" 46 47 // Number of levels to look ahead when re-ordering multi node operands. 48 static unsigned LookaheadMaxDepth = 5; 49 50 VPInstruction *VPlanSlp::markFailed() { 51 // FIXME: Currently this is used to signal we hit instructions we cannot 52 // trivially SLP'ize. 53 CompletelySLP = false; 54 return nullptr; 55 } 56 57 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) { 58 if (all_of(Operands, [](VPValue *V) { 59 return cast<VPInstruction>(V)->getUnderlyingInstr(); 60 })) { 61 unsigned BundleSize = 0; 62 for (VPValue *V : Operands) { 63 Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType(); 64 assert(!T->isVectorTy() && "Only scalar types supported for now"); 65 BundleSize += T->getScalarSizeInBits(); 66 } 67 WidestBundleBits = std::max(WidestBundleBits, BundleSize); 68 } 69 70 auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New); 71 assert(Res.second && 72 "Already created a combined instruction for the operand bundle"); 73 (void)Res; 74 } 75 76 bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const { 77 // Currently we only support VPInstructions. 78 if (!all_of(Operands, [](VPValue *Op) { 79 return Op && isa<VPInstruction>(Op) && 80 cast<VPInstruction>(Op)->getUnderlyingInstr(); 81 })) { 82 LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n"); 83 return false; 84 } 85 86 // Check if opcodes and type width agree for all instructions in the bundle. 87 // FIXME: Differing widths/opcodes can be handled by inserting additional 88 // instructions. 89 // FIXME: Deal with non-primitive types. 90 const Instruction *OriginalInstr = 91 cast<VPInstruction>(Operands[0])->getUnderlyingInstr(); 92 unsigned Opcode = OriginalInstr->getOpcode(); 93 unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits(); 94 if (!all_of(Operands, [Opcode, Width](VPValue *Op) { 95 const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr(); 96 return I->getOpcode() == Opcode && 97 I->getType()->getPrimitiveSizeInBits() == Width; 98 })) { 99 LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n"); 100 return false; 101 } 102 103 // For now, all operands must be defined in the same BB. 104 if (any_of(Operands, [this](VPValue *Op) { 105 return cast<VPInstruction>(Op)->getParent() != &this->BB; 106 })) { 107 LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n"); 108 return false; 109 } 110 111 if (any_of(Operands, 112 [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) { 113 LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n"); 114 return false; 115 } 116 117 // For loads, check that there are no instructions writing to memory in 118 // between them. 119 // TODO: we only have to forbid instructions writing to memory that could 120 // interfere with any of the loads in the bundle 121 if (Opcode == Instruction::Load) { 122 unsigned LoadsSeen = 0; 123 VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent(); 124 for (auto &I : *Parent) { 125 auto *VPI = cast<VPInstruction>(&I); 126 if (VPI->getOpcode() == Instruction::Load && 127 std::find(Operands.begin(), Operands.end(), VPI) != Operands.end()) 128 LoadsSeen++; 129 130 if (LoadsSeen == Operands.size()) 131 break; 132 if (LoadsSeen > 0 && VPI->mayWriteToMemory()) { 133 LLVM_DEBUG( 134 dbgs() << "VPSLP: instruction modifying memory between loads\n"); 135 return false; 136 } 137 } 138 139 if (!all_of(Operands, [](VPValue *Op) { 140 return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr()) 141 ->isSimple(); 142 })) { 143 LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n"); 144 return false; 145 } 146 } 147 148 if (Opcode == Instruction::Store) 149 if (!all_of(Operands, [](VPValue *Op) { 150 return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr()) 151 ->isSimple(); 152 })) { 153 LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n"); 154 return false; 155 } 156 157 return true; 158 } 159 160 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values, 161 unsigned OperandIndex) { 162 SmallVector<VPValue *, 4> Operands; 163 for (VPValue *V : Values) { 164 auto *U = cast<VPUser>(V); 165 Operands.push_back(U->getOperand(OperandIndex)); 166 } 167 return Operands; 168 } 169 170 static bool areCommutative(ArrayRef<VPValue *> Values) { 171 return Instruction::isCommutative( 172 cast<VPInstruction>(Values[0])->getOpcode()); 173 } 174 175 static SmallVector<SmallVector<VPValue *, 4>, 4> 176 getOperands(ArrayRef<VPValue *> Values) { 177 SmallVector<SmallVector<VPValue *, 4>, 4> Result; 178 auto *VPI = cast<VPInstruction>(Values[0]); 179 180 switch (VPI->getOpcode()) { 181 case Instruction::Load: 182 llvm_unreachable("Loads terminate a tree, no need to get operands"); 183 case Instruction::Store: 184 Result.push_back(getOperands(Values, 0)); 185 break; 186 default: 187 for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I) 188 Result.push_back(getOperands(Values, I)); 189 break; 190 } 191 192 return Result; 193 } 194 195 /// Returns the opcode of Values or ~0 if they do not all agree. 196 static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) { 197 unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode(); 198 if (any_of(Values, [Opcode](VPValue *V) { 199 return cast<VPInstruction>(V)->getOpcode() != Opcode; 200 })) 201 return None; 202 return {Opcode}; 203 } 204 205 /// Returns true if A and B access sequential memory if they are loads or 206 /// stores or if they have identical opcodes otherwise. 207 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, 208 VPInterleavedAccessInfo &IAI) { 209 if (A->getOpcode() != B->getOpcode()) 210 return false; 211 212 if (A->getOpcode() != Instruction::Load && 213 A->getOpcode() != Instruction::Store) 214 return true; 215 auto *GA = IAI.getInterleaveGroup(A); 216 auto *GB = IAI.getInterleaveGroup(B); 217 218 return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B); 219 } 220 221 /// Implements getLAScore from Listing 7 in the paper. 222 /// Traverses and compares operands of V1 and V2 to MaxLevel. 223 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, 224 VPInterleavedAccessInfo &IAI) { 225 if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2)) 226 return 0; 227 228 if (MaxLevel == 0) 229 return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1), 230 cast<VPInstruction>(V2), IAI); 231 232 unsigned Score = 0; 233 for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I) 234 for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J) 235 Score += getLAScore(cast<VPUser>(V1)->getOperand(I), 236 cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI); 237 return Score; 238 } 239 240 std::pair<VPlanSlp::OpMode, VPValue *> 241 VPlanSlp::getBest(OpMode Mode, VPValue *Last, 242 SmallPtrSetImpl<VPValue *> &Candidates, 243 VPInterleavedAccessInfo &IAI) { 244 assert((Mode == OpMode::Load || Mode == OpMode::Opcode) && 245 "Currently we only handle load and commutative opcodes"); 246 LLVM_DEBUG(dbgs() << " getBest\n"); 247 248 SmallVector<VPValue *, 4> BestCandidates; 249 LLVM_DEBUG(dbgs() << " Candidates for " 250 << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " "); 251 for (auto *Candidate : Candidates) { 252 auto *LastI = cast<VPInstruction>(Last); 253 auto *CandidateI = cast<VPInstruction>(Candidate); 254 if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) { 255 LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr() 256 << " "); 257 BestCandidates.push_back(Candidate); 258 } 259 } 260 LLVM_DEBUG(dbgs() << "\n"); 261 262 if (BestCandidates.empty()) 263 return {OpMode::Failed, nullptr}; 264 265 if (BestCandidates.size() == 1) 266 return {Mode, BestCandidates[0]}; 267 268 VPValue *Best = nullptr; 269 unsigned BestScore = 0; 270 for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) { 271 unsigned PrevScore = ~0u; 272 bool AllSame = true; 273 274 // FIXME: Avoid visiting the same operands multiple times. 275 for (auto *Candidate : BestCandidates) { 276 unsigned Score = getLAScore(Last, Candidate, Depth, IAI); 277 if (PrevScore == ~0u) 278 PrevScore = Score; 279 if (PrevScore != Score) 280 AllSame = false; 281 PrevScore = Score; 282 283 if (Score > BestScore) { 284 BestScore = Score; 285 Best = Candidate; 286 } 287 } 288 if (!AllSame) 289 break; 290 } 291 LLVM_DEBUG(dbgs() << "Found best " 292 << *cast<VPInstruction>(Best)->getUnderlyingInstr() 293 << "\n"); 294 Candidates.erase(Best); 295 296 return {Mode, Best}; 297 } 298 299 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() { 300 SmallVector<MultiNodeOpTy, 4> FinalOrder; 301 SmallVector<OpMode, 4> Mode; 302 FinalOrder.reserve(MultiNodeOps.size()); 303 Mode.reserve(MultiNodeOps.size()); 304 305 LLVM_DEBUG(dbgs() << "Reordering multinode\n"); 306 307 for (auto &Operands : MultiNodeOps) { 308 FinalOrder.push_back({Operands.first, {Operands.second[0]}}); 309 if (cast<VPInstruction>(Operands.second[0])->getOpcode() == 310 Instruction::Load) 311 Mode.push_back(OpMode::Load); 312 else 313 Mode.push_back(OpMode::Opcode); 314 } 315 316 for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) { 317 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n"); 318 SmallPtrSet<VPValue *, 4> Candidates; 319 LLVM_DEBUG(dbgs() << " Candidates "); 320 for (auto Ops : MultiNodeOps) { 321 LLVM_DEBUG( 322 dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr() 323 << " "); 324 Candidates.insert(Ops.second[Lane]); 325 } 326 LLVM_DEBUG(dbgs() << "\n"); 327 328 for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) { 329 LLVM_DEBUG(dbgs() << " Checking " << Op << "\n"); 330 if (Mode[Op] == OpMode::Failed) 331 continue; 332 333 VPValue *Last = FinalOrder[Op].second[Lane - 1]; 334 std::pair<OpMode, VPValue *> Res = 335 getBest(Mode[Op], Last, Candidates, IAI); 336 if (Res.second) 337 FinalOrder[Op].second.push_back(Res.second); 338 else 339 // TODO: handle this case 340 FinalOrder[Op].second.push_back(markFailed()); 341 } 342 } 343 344 return FinalOrder; 345 } 346 347 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) { 348 dbgs() << " Ops: "; 349 for (auto Op : Values) { 350 if (auto *VPInstr = cast_or_null<VPInstruction>(Op)) 351 if (auto *Instr = VPInstr->getUnderlyingInstr()) { 352 dbgs() << *Instr << " | "; 353 continue; 354 } 355 dbgs() << " nullptr | "; 356 } 357 dbgs() << "\n"; 358 } 359 360 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { 361 assert(!Values.empty() && "Need some operands!"); 362 363 // If we already visited this instruction bundle, re-use the existing node 364 auto I = BundleToCombined.find(to_vector<4>(Values)); 365 if (I != BundleToCombined.end()) { 366 #ifndef NDEBUG 367 // Check that the resulting graph is a tree. If we re-use a node, this means 368 // its values have multiple users. We only allow this, if all users of each 369 // value are the same instruction. 370 for (auto *V : Values) { 371 auto UI = V->user_begin(); 372 auto *FirstUser = *UI++; 373 while (UI != V->user_end()) { 374 assert(*UI == FirstUser && "Currently we only support SLP trees."); 375 UI++; 376 } 377 } 378 #endif 379 return I->second; 380 } 381 382 // Dump inputs 383 LLVM_DEBUG({ 384 dbgs() << "buildGraph: "; 385 dumpBundle(Values); 386 }); 387 388 if (!areVectorizable(Values)) 389 return markFailed(); 390 391 assert(getOpcode(Values) && "Opcodes for all values must match"); 392 unsigned ValuesOpcode = getOpcode(Values).getValue(); 393 394 SmallVector<VPValue *, 4> CombinedOperands; 395 if (areCommutative(Values)) { 396 bool MultiNodeRoot = !MultiNodeActive; 397 MultiNodeActive = true; 398 for (auto &Operands : getOperands(Values)) { 399 LLVM_DEBUG({ 400 dbgs() << " Visiting Commutative"; 401 dumpBundle(Operands); 402 }); 403 404 auto OperandsOpcode = getOpcode(Operands); 405 if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) { 406 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n"); 407 CombinedOperands.push_back(buildGraph(Operands)); 408 } else { 409 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n"); 410 // Create dummy VPInstruction, which will we replace later by the 411 // re-ordered operand. 412 VPInstruction *Op = new VPInstruction(0, {}); 413 CombinedOperands.push_back(Op); 414 MultiNodeOps.emplace_back(Op, Operands); 415 } 416 } 417 418 if (MultiNodeRoot) { 419 LLVM_DEBUG(dbgs() << "Reorder \n"); 420 MultiNodeActive = false; 421 422 auto FinalOrder = reorderMultiNodeOps(); 423 424 MultiNodeOps.clear(); 425 for (auto &Ops : FinalOrder) { 426 VPInstruction *NewOp = buildGraph(Ops.second); 427 Ops.first->replaceAllUsesWith(NewOp); 428 for (unsigned i = 0; i < CombinedOperands.size(); i++) 429 if (CombinedOperands[i] == Ops.first) 430 CombinedOperands[i] = NewOp; 431 delete Ops.first; 432 Ops.first = NewOp; 433 } 434 LLVM_DEBUG(dbgs() << "Found final order\n"); 435 } 436 } else { 437 LLVM_DEBUG(dbgs() << " NonCommuntative\n"); 438 if (ValuesOpcode == Instruction::Load) 439 for (VPValue *V : Values) 440 CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0)); 441 else 442 for (auto &Operands : getOperands(Values)) 443 CombinedOperands.push_back(buildGraph(Operands)); 444 } 445 446 unsigned Opcode; 447 switch (ValuesOpcode) { 448 case Instruction::Load: 449 Opcode = VPInstruction::SLPLoad; 450 break; 451 case Instruction::Store: 452 Opcode = VPInstruction::SLPStore; 453 break; 454 default: 455 Opcode = ValuesOpcode; 456 break; 457 } 458 459 if (!CompletelySLP) 460 return markFailed(); 461 462 assert(CombinedOperands.size() > 0 && "Need more some operands"); 463 auto *VPI = new VPInstruction(Opcode, CombinedOperands); 464 VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); 465 466 LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs()); 467 cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n"); 468 addCombined(Values, VPI); 469 return VPI; 470 } 471