1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// This file implements SLP analysis based on VPlan. The analysis is based on 9 /// the ideas described in 10 /// 11 /// Look-ahead SLP: auto-vectorization in the presence of commutative 12 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, 13 /// Luís F. W. Góes 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #include "VPlanSLP.h" 18 #include "VPlan.h" 19 #include "VPlanCFG.h" 20 #include "VPlanValue.h" 21 #include "llvm/ADT/DenseMap.h" 22 #include "llvm/ADT/SmallVector.h" 23 #include "llvm/Analysis/LoopInfo.h" 24 #include "llvm/Analysis/VectorUtils.h" 25 #include "llvm/IR/Instruction.h" 26 #include "llvm/IR/Instructions.h" 27 #include "llvm/IR/Type.h" 28 #include "llvm/IR/Value.h" 29 #include "llvm/Support/Casting.h" 30 #include "llvm/Support/Debug.h" 31 #include "llvm/Support/ErrorHandling.h" 32 #include "llvm/Support/raw_ostream.h" 33 #include <algorithm> 34 #include <cassert> 35 #include <optional> 36 #include <utility> 37 38 using namespace llvm; 39 40 #define DEBUG_TYPE "vplan-slp" 41 42 // Number of levels to look ahead when re-ordering multi node operands. 
static unsigned LookaheadMaxDepth = 5;

/// Walk all blocks of \p Region in reverse post-order and forward each block
/// to visitBlock, which performs the actual interleave-group translation.
void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
                                          Old2NewTy &Old2New,
                                          InterleavedAccessInfo &IAI) {
  // Shallow traversal: nested regions appear as single blocks here and are
  // recursed into by visitBlock.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Region->getEntry());
  for (VPBlockBase *Base : RPOT) {
    visitBlock(Base, Old2New, IAI);
  }
}

/// Translate the IR-level interleave groups recorded in \p IAI into
/// InterleaveGroup<VPInstruction> groups for every recipe in \p Block.
/// \p Old2New memoizes the IR-group -> VP-group mapping so that all members
/// of one IR group land in the same new group. Nested regions are handled
/// recursively via visitRegion.
void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
                                         InterleavedAccessInfo &IAI) {
  if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
    for (VPRecipeBase &VPI : *VPBB) {
      // Widened phis are never members of an interleave group; skip them.
      if (isa<VPWidenPHIRecipe>(&VPI))
        continue;
      // Only VPInstructions with an underlying IR instruction can be looked
      // up in the IR-level interleaved-access analysis.
      auto *VPInst = dyn_cast<VPInstruction>(&VPI);
      if (!VPInst)
        continue;
      auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue());
      if (!Inst)
        continue;
      auto *IG = IAI.getInterleaveGroup(Inst);
      if (!IG)
        continue;

      // First member of this IR group seen: create the corresponding
      // VPInstruction-based group with the same factor/direction/alignment.
      auto NewIGIter = Old2New.find(IG);
      if (NewIGIter == Old2New.end())
        Old2New[IG] = new InterleaveGroup<VPInstruction>(
            IG->getFactor(), IG->isReverse(), IG->getAlign());

      // Preserve the insertion point of the original group.
      if (Inst == IG->getInsertPos())
        Old2New[IG]->setInsertPos(VPInst);

      InterleaveGroupMap[VPInst] = Old2New[IG];
      // NOTE(review): the group factor is re-used as the member "alignment"
      // here, negated for reverse groups before being wrapped in Align —
      // confirm this encoding matches what InterleaveGroup::insertMember
      // expects (a negative value converted through Align looks suspicious).
      InterleaveGroupMap[VPInst]->insertMember(
          VPInst, IG->getIndex(Inst),
          Align(IG->isReverse() ? (-1) * int(IG->getFactor())
                                : IG->getFactor()));
    }
  } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) {
    visitRegion(Region, Old2New, IAI);
  } else {
    llvm_unreachable("Unsupported kind of VPBlock.");
  }
}

/// Build the VPInstruction-based interleave-group info for \p Plan by
/// visiting its vector loop region.
VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
                                                 InterleavedAccessInfo &IAI) {
  Old2NewTy Old2New;
  visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
}

/// Mark the whole SLP attempt as failed and return nullptr so callers can
/// propagate the failure as the "combined" value of a bundle.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  // trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}

/// Record \p New as the combined instruction for the bundle \p Operands and
/// update the widest bundle width (in bits) seen so far.
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  // Only bundles whose members all have an underlying IR instruction
  // contribute to WidestBundleBits, since the type is taken from there.
  if (all_of(Operands, [](VPValue *V) {
        return cast<VPInstruction>(V)->getUnderlyingInstr();
      })) {
    unsigned BundleSize = 0;
    for (VPValue *V : Operands) {
      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
      assert(!T->isVectorTy() && "Only scalar types supported for now");
      BundleSize += T->getScalarSizeInBits();
    }
    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
  }

  // Each operand bundle must be combined at most once.
  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Res.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Res;
}

/// Check that all values in \p Operands can form one SLP bundle: they must be
/// VPInstructions with underlying IR instructions, agree in opcode and
/// primitive type width, be defined in the same basic block, have at most one
/// unique user each, and (for loads/stores) be simple accesses with no
/// interfering memory writes between the loads.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  // instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  // Re-using a value in more than one bundle would break the tree shape the
  // rest of the analysis relies on.
  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  // interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &I : *Parent) {
      // A non-VPInstruction recipe ends the scan conservatively.
      auto *VPI = dyn_cast<VPInstruction>(&I);
      if (!VPI)
        break;
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      // Once every load of the bundle has been seen, later instructions
      // cannot sit "between" the loads any more.
      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    // Volatile/atomic (non-simple) loads are not supported.
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  // Volatile/atomic (non-simple) stores are not supported either.
  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}

/// Return the \p OperandIndex'th operand of every value in \p Values.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Operands;
  for (VPValue *V : Values) {
    // Currently we only support VPInstructions.
    auto *U = cast<VPInstruction>(V);
    Operands.push_back(U->getOperand(OperandIndex));
  }
  return Operands;
}

/// Return true if the (shared) opcode of \p Values is commutative.
static bool areCommutative(ArrayRef<VPValue *> Values) {
  return Instruction::isCommutative(
      cast<VPInstruction>(Values[0])->getOpcode());
}

/// Collect the operand bundles of \p Values: one bundle per operand index,
/// each containing that operand of every value in \p Values.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  auto *VPI = cast<VPInstruction>(Values[0]);

  switch (VPI->getOpcode()) {
  case Instruction::Load:
    llvm_unreachable("Loads terminate a tree, no need to get operands");
  case Instruction::Store:
    // Only operand 0 is bundled for stores (presumably the stored value;
    // the address is not part of the recursion) — confirm against
    // VPInstruction's store operand order.
    Result.push_back(getOperands(Values, 0));
    break;
  default:
    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
      Result.push_back(getOperands(Values, I));
    break;
  }

  return Result;
}

/// Returns the common opcode of \p Values, or std::nullopt if they do not all
/// agree.
static std::optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  if (any_of(Values, [Opcode](VPValue *V) {
        return cast<VPInstruction>(V)->getOpcode() != Opcode;
      }))
    return std::nullopt;
  return {Opcode};
}

/// Returns true if A and B access sequential memory if they are loads or
/// stores or if they have identical opcodes otherwise.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  if (A->getOpcode() != B->getOpcode())
    return false;

  // Non-memory instructions only need matching opcodes.
  if (A->getOpcode() != Instruction::Load &&
      A->getOpcode() != Instruction::Store)
    return true;
  // Memory accesses are "consecutive" when they belong to the same
  // interleave group and B immediately follows A within that group.
  auto *GA = IAI.getInterleaveGroup(A);
  auto *GB = IAI.getInterleaveGroup(B);

  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
}

/// Implements getLAScore from Listing 7 in the paper.
/// Traverses and compares operands of V1 and V2 to MaxLevel.
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
                           VPInterleavedAccessInfo &IAI) {
  auto *I1 = dyn_cast<VPInstruction>(V1);
  auto *I2 = dyn_cast<VPInstruction>(V2);
  // Currently we only support VPInstructions.
  if (!I1 || !I2)
    return 0;

  // At the bottom of the look-ahead window the score is simply 0/1 for
  // "do these two instructions match or access consecutive memory".
  if (MaxLevel == 0)
    return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);

  // Otherwise sum the scores over all operand pairings, one level deeper.
  unsigned Score = 0;
  for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
    for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
      Score +=
          getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
  return Score;
}

/// Pick the candidate from \p Candidates that best matches \p Last: first
/// filter to candidates that are consecutive-or-matching with Last, then
/// break ties via the look-ahead score at increasing depth. The chosen value
/// is removed from \p Candidates; returns {OpMode::Failed, nullptr} when no
/// candidate matches at all.
std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << " getBest\n");

  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << " Candidates for "
                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
  for (auto *Candidate : Candidates) {
    auto *LastI = cast<VPInstruction>(Last);
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
                        << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  // A single match needs no tie-breaking.
  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  // Multiple matches: deepen the look-ahead until the scores diverge (or the
  // depth limit is hit) and keep the highest-scoring candidate seen.
  VPValue *Best = nullptr;
  unsigned BestScore = 0;
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      // ~0u is the "no score seen yet" sentinel for the first candidate at
      // this depth.
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    // Scores diverged at this depth, so Best is a strict winner; stop.
    if (!AllSame)
      break;
  }
  LLVM_DEBUG(dbgs() << "Found best "
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                    << "\n");
  Candidates.erase(Best);

  return {Mode, Best};
}

/// Greedily re-order the lanes of all multi-node operand bundles: lane 0
/// fixes the initial order, and for each subsequent lane the best match for
/// the previous lane's value is picked from the lane's candidates via
/// getBest. Rows that fail stay marked via markFailed().
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  // Seed each operand row with its lane-0 value and classify the row as a
  // load row or a (commutative) opcode row.
  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
    // Every row's value at this lane is a candidate; rows then draw from this
    // shared pool (getBest erases the value it picks).
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << " Candidates ");
    for (auto Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
      LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      // Match against the value chosen for this row in the previous lane.
      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second)
        FinalOrder[Op].second.push_back(Res.second);
      else
        // TODO: handle this case
        FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the underlying IR instruction of every value in \p Values, or
/// "nullptr" for values without one, to the debug stream.
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  dbgs() << " Ops: ";
  for (auto *Op : Values) {
    if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
      if (auto *Instr = VPInstr->getUnderlyingInstr()) {
        dbgs() << *Instr << " | ";
        continue;
      }
    // Reached when Op is null or has no underlying instruction.
    dbgs() << " nullptr | ";
  }
  dbgs() << "\n";
}
#endif

/// Recursively build the SLP graph for the bundle \p Values and return the
/// combined VPInstruction for it (nullptr after a failure was recorded via
/// markFailed). Commutative bundles are collected into a multi-node whose
/// operands are re-ordered at the outermost (root) commutative level.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this means
    // its values have multiple users. We only allow this, if all users of each
    // value are the same instruction.
    for (auto *V : Values) {
      auto UI = V->user_begin();
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = *getOpcode(Values);

  SmallVector<VPValue *, 4> CombinedOperands;
  if (areCommutative(Values)) {
    // The outermost commutative bundle acts as the multi-node root; only it
    // triggers the re-ordering after all nested commutative bundles have
    // contributed their operands to MultiNodeOps.
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << " Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
        // Create dummy VPInstruction, which will we replace later by the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      // Build the graph for each re-ordered bundle, then swap the dummy
      // placeholder for the real combined instruction and free the dummy.
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << " NonCommuntative\n");
    if (ValuesOpcode == Instruction::Load)
      // Loads terminate the tree: keep operand 0 (presumably the address —
      // confirm against VPInstruction's load operand order) directly.
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

  // Map IR load/store opcodes to the dedicated SLP VPInstruction opcodes;
  // everything else keeps its opcode.
  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  // A failure anywhere in the recursion poisons the whole graph.
  if (!CompletelySLP)
    return markFailed();

  // NOTE(review): assert message reads "Need more some operands" — typo, but
  // it is a runtime string and left unchanged here.
  assert(CombinedOperands.size() > 0 && "Need more some operands");
  auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
  auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << Values[0]
                    << "\n");
  addCombined(Values, VPI);
  return VPI;
}