//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements SLP analysis based on VPlan. The analysis is based on
/// the ideas described in
///
///   Look-ahead SLP: auto-vectorization in the presence of commutative
///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
///   Luís F. W. Góes
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "vplan-slp"

// Number of levels to look ahead when re-ordering multi node operands.
static unsigned LookaheadMaxDepth = 5;

/// Record that the current bundle cannot be SLP-ized and return nullptr, the
/// sentinel value buildGraph() uses for a failed combination.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  // trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}

/// Cache \p New as the combined instruction for the bundle \p Operands, and
/// fold the bundle's total scalar bit-width into WidestBundleBits.
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  // Only bundles whose members all wrap an underlying IR instruction take
  // part in the width bookkeeping; a bundle's width is the sum of its
  // members' scalar sizes in bits.
  if (all_of(Operands, [](VPValue *V) {
        return cast<VPInstruction>(V)->getUnderlyingInstr();
      })) {
    unsigned BundleSize = 0;
    for (VPValue *V : Operands) {
      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
      assert(!T->isVectorTy() && "Only scalar types supported for now");
      BundleSize += T->getScalarSizeInBits();
    }
    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
  }

  // Each operand bundle must map to exactly one combined instruction.
  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Res.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Res;
}

/// Return true if the bundle \p Operands can be combined into a single
/// vectorized instruction: all members must be VPInstructions wrapping an IR
/// instruction, agree in opcode and width, be defined in this block, have a
/// single unique user, and (for loads/stores) be simple with no intervening
/// writes to memory.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  // instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  // A value used by more than one distinct instruction would turn the SLP
  // graph into a DAG; we only build trees for now.
  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  // interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    // Walk the block in order: once the first bundle load has been seen, any
    // may-write instruction before the last bundle load is a conflict.
    for (auto &I : *Parent) {
      auto *VPI = dyn_cast<VPInstruction>(&I);
      if (!VPI)
        break;
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      // All bundle loads visited; nothing writes between them.
      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    // Only simple (non-volatile, non-atomic) loads are supported.
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  // Only simple (non-volatile, non-atomic) stores are supported.
  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}

/// Collect the operand at index \p OperandIndex of each value in \p Values,
/// preserving the order of \p Values.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Operands;
  for (VPValue *V : Values) {
    // Currently we only support VPInstructions.
    auto *U = cast<VPInstruction>(V);
    Operands.push_back(U->getOperand(OperandIndex));
  }
  return Operands;
}

/// Return true if the (shared) opcode of \p Values is commutative. Callers
/// have already ensured all values agree in opcode, so checking the first
/// element is sufficient.
static bool areCommutative(ArrayRef<VPValue *> Values) {
  return Instruction::isCommutative(
      cast<VPInstruction>(Values[0])->getOpcode());
}

/// Build one operand bundle per operand position of \p Values, i.e. the
/// transpose of the operand lists. Stores contribute only their value operand
/// (index 0); loads terminate the tree and must not reach here.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  auto *VPI = cast<VPInstruction>(Values[0]);

  switch (VPI->getOpcode()) {
  case Instruction::Load:
    llvm_unreachable("Loads terminate a tree, no need to get operands");
  case Instruction::Store:
    // Only the stored value is combined; the address is not part of the tree.
    Result.push_back(getOperands(Values, 0));
    break;
  default:
    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
      Result.push_back(getOperands(Values, I));
    break;
  }

  return Result;
}

/// Returns the common opcode of \p Values, or None if they do not all agree.
static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  if (any_of(Values, [Opcode](VPValue *V) {
        return cast<VPInstruction>(V)->getOpcode() != Opcode;
      }))
    return None;
  return {Opcode};
}

/// Returns true if A and B access sequential memory if they are loads or
/// stores or if they have identical opcodes otherwise.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  if (A->getOpcode() != B->getOpcode())
    return false;

  // For non-memory instructions a matching opcode is all we require.
  if (A->getOpcode() != Instruction::Load &&
      A->getOpcode() != Instruction::Store)
    return true;
  // Memory accesses are consecutive when both belong to the same interleave
  // group and B immediately follows A within it.
  auto *GA = IAI.getInterleaveGroup(A);
  auto *GB = IAI.getInterleaveGroup(B);

  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
}

/// Implements getLAScore from Listing 7 in the paper.
/// Traverses and compares operands of V1 and V2 to MaxLevel.
226 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, 227 VPInterleavedAccessInfo &IAI) { 228 auto *I1 = dyn_cast<VPInstruction>(V1); 229 auto *I2 = dyn_cast<VPInstruction>(V2); 230 // Currently we only support VPInstructions. 231 if (!I1 || !I2) 232 return 0; 233 234 if (MaxLevel == 0) 235 return (unsigned)areConsecutiveOrMatch(I1, I2, IAI); 236 237 unsigned Score = 0; 238 for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I) 239 for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J) 240 Score += 241 getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI); 242 return Score; 243 } 244 245 std::pair<VPlanSlp::OpMode, VPValue *> 246 VPlanSlp::getBest(OpMode Mode, VPValue *Last, 247 SmallPtrSetImpl<VPValue *> &Candidates, 248 VPInterleavedAccessInfo &IAI) { 249 assert((Mode == OpMode::Load || Mode == OpMode::Opcode) && 250 "Currently we only handle load and commutative opcodes"); 251 LLVM_DEBUG(dbgs() << " getBest\n"); 252 253 SmallVector<VPValue *, 4> BestCandidates; 254 LLVM_DEBUG(dbgs() << " Candidates for " 255 << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " "); 256 for (auto *Candidate : Candidates) { 257 auto *LastI = cast<VPInstruction>(Last); 258 auto *CandidateI = cast<VPInstruction>(Candidate); 259 if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) { 260 LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr() 261 << " "); 262 BestCandidates.push_back(Candidate); 263 } 264 } 265 LLVM_DEBUG(dbgs() << "\n"); 266 267 if (BestCandidates.empty()) 268 return {OpMode::Failed, nullptr}; 269 270 if (BestCandidates.size() == 1) 271 return {Mode, BestCandidates[0]}; 272 273 VPValue *Best = nullptr; 274 unsigned BestScore = 0; 275 for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) { 276 unsigned PrevScore = ~0u; 277 bool AllSame = true; 278 279 // FIXME: Avoid visiting the same operands multiple times. 
280 for (auto *Candidate : BestCandidates) { 281 unsigned Score = getLAScore(Last, Candidate, Depth, IAI); 282 if (PrevScore == ~0u) 283 PrevScore = Score; 284 if (PrevScore != Score) 285 AllSame = false; 286 PrevScore = Score; 287 288 if (Score > BestScore) { 289 BestScore = Score; 290 Best = Candidate; 291 } 292 } 293 if (!AllSame) 294 break; 295 } 296 LLVM_DEBUG(dbgs() << "Found best " 297 << *cast<VPInstruction>(Best)->getUnderlyingInstr() 298 << "\n"); 299 Candidates.erase(Best); 300 301 return {Mode, Best}; 302 } 303 304 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() { 305 SmallVector<MultiNodeOpTy, 4> FinalOrder; 306 SmallVector<OpMode, 4> Mode; 307 FinalOrder.reserve(MultiNodeOps.size()); 308 Mode.reserve(MultiNodeOps.size()); 309 310 LLVM_DEBUG(dbgs() << "Reordering multinode\n"); 311 312 for (auto &Operands : MultiNodeOps) { 313 FinalOrder.push_back({Operands.first, {Operands.second[0]}}); 314 if (cast<VPInstruction>(Operands.second[0])->getOpcode() == 315 Instruction::Load) 316 Mode.push_back(OpMode::Load); 317 else 318 Mode.push_back(OpMode::Opcode); 319 } 320 321 for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) { 322 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n"); 323 SmallPtrSet<VPValue *, 4> Candidates; 324 LLVM_DEBUG(dbgs() << " Candidates "); 325 for (auto Ops : MultiNodeOps) { 326 LLVM_DEBUG( 327 dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr() 328 << " "); 329 Candidates.insert(Ops.second[Lane]); 330 } 331 LLVM_DEBUG(dbgs() << "\n"); 332 333 for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) { 334 LLVM_DEBUG(dbgs() << " Checking " << Op << "\n"); 335 if (Mode[Op] == OpMode::Failed) 336 continue; 337 338 VPValue *Last = FinalOrder[Op].second[Lane - 1]; 339 std::pair<OpMode, VPValue *> Res = 340 getBest(Mode[Op], Last, Candidates, IAI); 341 if (Res.second) 342 FinalOrder[Op].second.push_back(Res.second); 343 else 344 // TODO: handle this case 
345 FinalOrder[Op].second.push_back(markFailed()); 346 } 347 } 348 349 return FinalOrder; 350 } 351 352 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 353 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) { 354 dbgs() << " Ops: "; 355 for (auto Op : Values) { 356 if (auto *VPInstr = cast_or_null<VPInstruction>(Op)) 357 if (auto *Instr = VPInstr->getUnderlyingInstr()) { 358 dbgs() << *Instr << " | "; 359 continue; 360 } 361 dbgs() << " nullptr | "; 362 } 363 dbgs() << "\n"; 364 } 365 #endif 366 367 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { 368 assert(!Values.empty() && "Need some operands!"); 369 370 // If we already visited this instruction bundle, re-use the existing node 371 auto I = BundleToCombined.find(to_vector<4>(Values)); 372 if (I != BundleToCombined.end()) { 373 #ifndef NDEBUG 374 // Check that the resulting graph is a tree. If we re-use a node, this means 375 // its values have multiple users. We only allow this, if all users of each 376 // value are the same instruction. 
377 for (auto *V : Values) { 378 auto UI = V->user_begin(); 379 auto *FirstUser = *UI++; 380 while (UI != V->user_end()) { 381 assert(*UI == FirstUser && "Currently we only support SLP trees."); 382 UI++; 383 } 384 } 385 #endif 386 return I->second; 387 } 388 389 // Dump inputs 390 LLVM_DEBUG({ 391 dbgs() << "buildGraph: "; 392 dumpBundle(Values); 393 }); 394 395 if (!areVectorizable(Values)) 396 return markFailed(); 397 398 assert(getOpcode(Values) && "Opcodes for all values must match"); 399 unsigned ValuesOpcode = getOpcode(Values).getValue(); 400 401 SmallVector<VPValue *, 4> CombinedOperands; 402 if (areCommutative(Values)) { 403 bool MultiNodeRoot = !MultiNodeActive; 404 MultiNodeActive = true; 405 for (auto &Operands : getOperands(Values)) { 406 LLVM_DEBUG({ 407 dbgs() << " Visiting Commutative"; 408 dumpBundle(Operands); 409 }); 410 411 auto OperandsOpcode = getOpcode(Operands); 412 if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) { 413 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n"); 414 CombinedOperands.push_back(buildGraph(Operands)); 415 } else { 416 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n"); 417 // Create dummy VPInstruction, which will we replace later by the 418 // re-ordered operand. 
419 VPInstruction *Op = new VPInstruction(0, {}); 420 CombinedOperands.push_back(Op); 421 MultiNodeOps.emplace_back(Op, Operands); 422 } 423 } 424 425 if (MultiNodeRoot) { 426 LLVM_DEBUG(dbgs() << "Reorder \n"); 427 MultiNodeActive = false; 428 429 auto FinalOrder = reorderMultiNodeOps(); 430 431 MultiNodeOps.clear(); 432 for (auto &Ops : FinalOrder) { 433 VPInstruction *NewOp = buildGraph(Ops.second); 434 Ops.first->replaceAllUsesWith(NewOp); 435 for (unsigned i = 0; i < CombinedOperands.size(); i++) 436 if (CombinedOperands[i] == Ops.first) 437 CombinedOperands[i] = NewOp; 438 delete Ops.first; 439 Ops.first = NewOp; 440 } 441 LLVM_DEBUG(dbgs() << "Found final order\n"); 442 } 443 } else { 444 LLVM_DEBUG(dbgs() << " NonCommuntative\n"); 445 if (ValuesOpcode == Instruction::Load) 446 for (VPValue *V : Values) 447 CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0)); 448 else 449 for (auto &Operands : getOperands(Values)) 450 CombinedOperands.push_back(buildGraph(Operands)); 451 } 452 453 unsigned Opcode; 454 switch (ValuesOpcode) { 455 case Instruction::Load: 456 Opcode = VPInstruction::SLPLoad; 457 break; 458 case Instruction::Store: 459 Opcode = VPInstruction::SLPStore; 460 break; 461 default: 462 Opcode = ValuesOpcode; 463 break; 464 } 465 466 if (!CompletelySLP) 467 return markFailed(); 468 469 assert(CombinedOperands.size() > 0 && "Need more some operands"); 470 auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr(); 471 auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); 472 VPI->setUnderlyingInstr(Inst); 473 474 LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " 475 << *cast<VPInstruction>(Values[0]) << "\n"); 476 addCombined(Values, VPI); 477 return VPI; 478 } 479