//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements SLP analysis based on VPlan. The analysis is based on
/// the ideas described in
///
///   Look-ahead SLP: auto-vectorization in the presence of commutative
///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
///   Luís F. W. Góes
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <cassert>
#include <iterator>
#include <string>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "vplan-slp"

// Number of levels to look ahead when re-ordering multi node operands.
static unsigned LookaheadMaxDepth = 5;

/// Record that the current SLP attempt hit something we cannot handle and
/// return nullptr, so callers can propagate the failure as the "combined
/// instruction" result for a bundle.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  // trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}

/// Register \p New as the combined instruction for the bundle \p Operands and
/// update WidestBundleBits, which tracks the widest total scalar bit-width
/// seen across all bundles so far.
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  // Only bundles where every operand has an underlying IR instruction
  // contribute to the bundle-width statistic.
  if (all_of(Operands, [](VPValue *V) {
        return cast<VPInstruction>(V)->getUnderlyingInstr();
      })) {
    unsigned BundleSize = 0;
    for (VPValue *V : Operands) {
      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
      assert(!T->isVectorTy() && "Only scalar types supported for now");
      BundleSize += T->getScalarSizeInBits();
    }
    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
  }

  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Res.second &&
         "Already created a combined instruction for the operand bundle");
  // Res is only used by the assert above; avoid an unused-variable warning in
  // builds with assertions disabled.
  (void)Res;
}

/// Return true if the bundle \p Operands can be combined into a single wide
/// (SLP) instruction: all operands must be VPInstructions with matching
/// opcodes and type widths, defined in the same basic block, each with a
/// single unique user, and loads/stores must additionally be "simple" and
/// free of intervening memory writes.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  // instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  // Each operand must feed exactly one (unique) user, otherwise combining it
  // would require keeping the scalar version alive as well.
  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  // interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    // Walk the block once: between the first and last load of the bundle, no
    // instruction may write to memory.
    for (auto &I : *Parent) {
      auto *VPI = cast<VPInstruction>(&I);
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      // All loads of the bundle have been seen; the remainder of the block
      // cannot interfere.
      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    // Volatile/atomic loads are rejected.
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  // Volatile/atomic stores are rejected.
  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}

/// Collect the \p OperandIndex'th operand of each value in \p Values into a
/// new bundle.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Operands;
  for (VPValue *V : Values) {
    // Currently we only support VPInstructions.
    auto *U = cast<VPInstruction>(V);
    Operands.push_back(U->getOperand(OperandIndex));
  }
  return Operands;
}

/// Return true if the (shared) opcode of the bundle is commutative. Only the
/// first value is inspected; callers ensure all opcodes in the bundle agree.
static bool areCommutative(ArrayRef<VPValue *> Values) {
  return Instruction::isCommutative(
      cast<VPInstruction>(Values[0])->getOpcode());
}

/// Build one operand bundle per operand position of \p Values. For stores
/// only the stored value (operand 0) forms a bundle; loads terminate the
/// recursion and must not reach here.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  auto *VPI = cast<VPInstruction>(Values[0]);

  switch (VPI->getOpcode()) {
  case Instruction::Load:
    llvm_unreachable("Loads terminate a tree, no need to get operands");
  case Instruction::Store:
    Result.push_back(getOperands(Values, 0));
    break;
  default:
    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
      Result.push_back(getOperands(Values, I));
    break;
  }

  return Result;
}

/// Returns the common opcode of Values, or None if they do not all agree.
static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  if (any_of(Values, [Opcode](VPValue *V) {
        return cast<VPInstruction>(V)->getOpcode() != Opcode;
      }))
    return None;
  return {Opcode};
}

/// Returns true if A and B access sequential memory if they are loads or
/// stores or if they have identical opcodes otherwise.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  if (A->getOpcode() != B->getOpcode())
    return false;

  // Non-memory instructions only need matching opcodes.
  if (A->getOpcode() != Instruction::Load &&
      A->getOpcode() != Instruction::Store)
    return true;
  auto *GA = IAI.getInterleaveGroup(A);
  auto *GB = IAI.getInterleaveGroup(B);

  // Loads/stores are consecutive iff they belong to the same interleave group
  // and B immediately follows A within it.
  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
}

/// Implements getLAScore from Listing 7 in the paper.
/// Traverses and compares operands of V1 and V2 to MaxLevel.
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
                           VPInterleavedAccessInfo &IAI) {
  auto *I1 = dyn_cast<VPInstruction>(V1);
  auto *I2 = dyn_cast<VPInstruction>(V2);
  // Currently we only support VPInstructions.
  if (!I1 || !I2)
    return 0;

  // At the deepest level the score is simply whether the two instructions
  // match (or access consecutive memory).
  if (MaxLevel == 0)
    return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);

  // Otherwise sum the scores of all operand pairings one level down.
  unsigned Score = 0;
  for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
    for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
      Score +=
          getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
  return Score;
}

/// Pick the best candidate from \p Candidates to extend the lane ending in
/// \p Last: first filter to candidates that match/are consecutive with
/// \p Last, then break ties using the look-ahead score at increasing depths.
/// Returns {OpMode::Failed, nullptr} if no candidate matches; otherwise the
/// chosen candidate is removed from \p Candidates.
std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << " getBest\n");

  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << " Candidates for "
                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
  for (auto *Candidate : Candidates) {
    auto *LastI = cast<VPInstruction>(Last);
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
                        << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  // Multiple viable candidates: use the look-ahead score to discriminate.
  // Increase the depth only while all candidates score the same.
  VPValue *Best = nullptr;
  unsigned BestScore = 0;
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    // Scores diverged at this depth, so a strict winner has been found.
    if (!AllSame)
      break;
  }
  // NOTE(review): if every candidate scores 0 at every depth, Best stays
  // nullptr here — the LLVM_DEBUG dump below would then dereference a null
  // pointer and Candidates.erase(nullptr) is a no-op; callers do handle a
  // null result. Confirm whether this path is reachable in practice.
  LLVM_DEBUG(dbgs() << "Found best "
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                    << "\n");
  Candidates.erase(Best);

  return {Mode, Best};
}

/// Re-order the operand bundles of the current multi node lane by lane, so
/// that within each bundle consecutive lanes hold matching/consecutive
/// values (Listing 6-style reordering from the paper).
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  // Seed each bundle with its lane-0 value and classify it as a load bundle
  // or a generic opcode bundle.
  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  // For every remaining lane, greedily assign each bundle the best-matching
  // value from the pool of that lane's candidates.
  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << " Candidates ");
    for (auto Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
      LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      // Extend this bundle with the value that best matches its previous
      // lane.
      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second)
        FinalOrder[Op].second.push_back(Res.second);
      else
        // TODO: handle this case
        // No match found: mark the whole SLP attempt failed and record a
        // null placeholder for this lane.
        FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}

/// Debug helper: print the underlying IR instruction of each value in
/// \p Values, or "nullptr" for values without one.
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  dbgs() << " Ops: ";
  for (auto Op : Values) {
    if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
      if (auto *Instr = VPInstr->getUnderlyingInstr()) {
        dbgs() << *Instr << " | ";
        continue;
      }
    dbgs() << " nullptr | ";
  }
  dbgs() << "\n";
}

/// Recursively build the SLP graph for the bundle \p Values: verify the
/// bundle is vectorizable, combine its operand bundles (re-ordering
/// commutative multi-node operands via reorderMultiNodeOps), and return the
/// new combined VPInstruction, or nullptr (markFailed) on failure. Results
/// are memoized in BundleToCombined so shared sub-bundles are built once.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this
    // means its values have multiple users. We only allow this, if all users
    // of each value are the same instruction.
    for (auto *V : Values) {
      auto UI = V->user_begin();
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = getOpcode(Values).getValue();

  SmallVector<VPValue *, 4> CombinedOperands;
  if (areCommutative(Values)) {
    // The outermost commutative bundle is the multi-node root; it is the one
    // that triggers operand re-ordering once all operand bundles have been
    // gathered.
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << " Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
        // Create dummy VPInstruction, which will we replace later by the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      // Build each re-ordered bundle and splice the result in wherever the
      // dummy placeholder was used, then free the dummy.
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << " NonCommuntative\n");
    if (ValuesOpcode == Instruction::Load)
      // Loads terminate the tree: keep their address operands directly.
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

  // Map memory opcodes to the dedicated SLP pseudo-opcodes; everything else
  // keeps its IR opcode.
  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  // A failure anywhere in the recursion poisons the whole tree.
  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need more some operands");
  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
                    << *cast<VPInstruction>(Values[0]) << "\n");
  addCombined(Values, VPI);
  return VPI;
}