//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements SLP analysis based on VPlan. The analysis is based on
/// the ideas described in
///
/// Look-ahead SLP: auto-vectorization in the presence of commutative
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
/// Luís F. W. Góes
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "vplan-slp"

// Number of levels to look ahead when re-ordering multi node operands.
static unsigned LookaheadMaxDepth = 5;

/// Record that SLP'ization failed by clearing CompletelySLP, and return
/// nullptr so callers can use the result directly as the "no combined
/// instruction" marker.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  // trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}

/// Remember \p New as the combined instruction for the bundle \p Operands and
/// update the widest-bundle bookkeeping.
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  // Only bundles where every operand has an underlying IR instruction
  // contribute to WidestBundleBits; the types must be scalar.
  if (all_of(Operands, [](VPValue *V) {
        return cast<VPInstruction>(V)->getUnderlyingInstr();
      })) {
    unsigned BundleSize = 0;
    for (VPValue *V : Operands) {
      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
      assert(!T->isVectorTy() && "Only scalar types supported for now");
      BundleSize += T->getScalarSizeInBits();
    }
    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
  }

  // Cache the combined instruction so buildGraph can re-use it when it sees
  // the same operand bundle again.
  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Res.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Res;
}

/// Returns true if the bundle \p Operands can be combined into a single
/// vectorized instruction: all operands must be VPInstructions with an
/// underlying instruction, agree in opcode and type width, live in the same
/// basic block, have a single (unique) user, and loads/stores must be simple
/// with no interfering memory writes in between.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  // instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  // Re-using a value in more than one place would break the tree shape of
  // the SLP graph.
  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  // interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &I : *Parent) {
      auto *VPI = dyn_cast<VPInstruction>(&I);
      if (!VPI)
        break;
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      // Stop scanning once every load of the bundle has been seen.
      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}

/// Returns the operand at index \p OperandIndex of each value in \p Values.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Operands;
  for (VPValue *V : Values) {
    // Currently we only support VPInstructions.
    auto *U = cast<VPInstruction>(V);
    Operands.push_back(U->getOperand(OperandIndex));
  }
  return Operands;
}

/// Returns true if the opcode of the first value in \p Values is commutative.
/// (Callers have already checked that all values share one opcode.)
static bool areCommutative(ArrayRef<VPValue *> Values) {
  return Instruction::isCommutative(
      cast<VPInstruction>(Values[0])->getOpcode());
}

/// Returns one operand bundle per operand index of \p Values. Stores
/// contribute only their stored value (operand 0); loads terminate a tree
/// and must not reach here.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  auto *VPI = cast<VPInstruction>(Values[0]);

  switch (VPI->getOpcode()) {
  case Instruction::Load:
    llvm_unreachable("Loads terminate a tree, no need to get operands");
  case Instruction::Store:
    Result.push_back(getOperands(Values, 0));
    break;
  default:
    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
      Result.push_back(getOperands(Values, I));
    break;
  }

  return Result;
}

/// Returns the opcode shared by all of \p Values, or None if they do not all
/// agree.
static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  if (any_of(Values, [Opcode](VPValue *V) {
        return cast<VPInstruction>(V)->getOpcode() != Opcode;
      }))
    return None;
  return {Opcode};
}

/// Returns true if A and B access sequential memory if they are loads or
/// stores or if they have identical opcodes otherwise.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  if (A->getOpcode() != B->getOpcode())
    return false;

  // Non-memory instructions only need matching opcodes.
  if (A->getOpcode() != Instruction::Load &&
      A->getOpcode() != Instruction::Store)
    return true;
  auto *GA = IAI.getInterleaveGroup(A);
  auto *GB = IAI.getInterleaveGroup(B);

  // Consecutive iff both belong to the same interleave group and B sits at
  // the index directly after A.
  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
}

/// Implements getLAScore from Listing 7 in the paper.
/// Traverses and compares operands of V1 and V2 to MaxLevel.
217 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, 218 VPInterleavedAccessInfo &IAI) { 219 auto *I1 = dyn_cast<VPInstruction>(V1); 220 auto *I2 = dyn_cast<VPInstruction>(V2); 221 // Currently we only support VPInstructions. 222 if (!I1 || !I2) 223 return 0; 224 225 if (MaxLevel == 0) 226 return (unsigned)areConsecutiveOrMatch(I1, I2, IAI); 227 228 unsigned Score = 0; 229 for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I) 230 for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J) 231 Score += 232 getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI); 233 return Score; 234 } 235 236 std::pair<VPlanSlp::OpMode, VPValue *> 237 VPlanSlp::getBest(OpMode Mode, VPValue *Last, 238 SmallPtrSetImpl<VPValue *> &Candidates, 239 VPInterleavedAccessInfo &IAI) { 240 assert((Mode == OpMode::Load || Mode == OpMode::Opcode) && 241 "Currently we only handle load and commutative opcodes"); 242 LLVM_DEBUG(dbgs() << " getBest\n"); 243 244 SmallVector<VPValue *, 4> BestCandidates; 245 LLVM_DEBUG(dbgs() << " Candidates for " 246 << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " "); 247 for (auto *Candidate : Candidates) { 248 auto *LastI = cast<VPInstruction>(Last); 249 auto *CandidateI = cast<VPInstruction>(Candidate); 250 if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) { 251 LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr() 252 << " "); 253 BestCandidates.push_back(Candidate); 254 } 255 } 256 LLVM_DEBUG(dbgs() << "\n"); 257 258 if (BestCandidates.empty()) 259 return {OpMode::Failed, nullptr}; 260 261 if (BestCandidates.size() == 1) 262 return {Mode, BestCandidates[0]}; 263 264 VPValue *Best = nullptr; 265 unsigned BestScore = 0; 266 for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) { 267 unsigned PrevScore = ~0u; 268 bool AllSame = true; 269 270 // FIXME: Avoid visiting the same operands multiple times. 
271 for (auto *Candidate : BestCandidates) { 272 unsigned Score = getLAScore(Last, Candidate, Depth, IAI); 273 if (PrevScore == ~0u) 274 PrevScore = Score; 275 if (PrevScore != Score) 276 AllSame = false; 277 PrevScore = Score; 278 279 if (Score > BestScore) { 280 BestScore = Score; 281 Best = Candidate; 282 } 283 } 284 if (!AllSame) 285 break; 286 } 287 LLVM_DEBUG(dbgs() << "Found best " 288 << *cast<VPInstruction>(Best)->getUnderlyingInstr() 289 << "\n"); 290 Candidates.erase(Best); 291 292 return {Mode, Best}; 293 } 294 295 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() { 296 SmallVector<MultiNodeOpTy, 4> FinalOrder; 297 SmallVector<OpMode, 4> Mode; 298 FinalOrder.reserve(MultiNodeOps.size()); 299 Mode.reserve(MultiNodeOps.size()); 300 301 LLVM_DEBUG(dbgs() << "Reordering multinode\n"); 302 303 for (auto &Operands : MultiNodeOps) { 304 FinalOrder.push_back({Operands.first, {Operands.second[0]}}); 305 if (cast<VPInstruction>(Operands.second[0])->getOpcode() == 306 Instruction::Load) 307 Mode.push_back(OpMode::Load); 308 else 309 Mode.push_back(OpMode::Opcode); 310 } 311 312 for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) { 313 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n"); 314 SmallPtrSet<VPValue *, 4> Candidates; 315 LLVM_DEBUG(dbgs() << " Candidates "); 316 for (auto Ops : MultiNodeOps) { 317 LLVM_DEBUG( 318 dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr() 319 << " "); 320 Candidates.insert(Ops.second[Lane]); 321 } 322 LLVM_DEBUG(dbgs() << "\n"); 323 324 for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) { 325 LLVM_DEBUG(dbgs() << " Checking " << Op << "\n"); 326 if (Mode[Op] == OpMode::Failed) 327 continue; 328 329 VPValue *Last = FinalOrder[Op].second[Lane - 1]; 330 std::pair<OpMode, VPValue *> Res = 331 getBest(Mode[Op], Last, Candidates, IAI); 332 if (Res.second) 333 FinalOrder[Op].second.push_back(Res.second); 334 else 335 // TODO: handle this case 
336 FinalOrder[Op].second.push_back(markFailed()); 337 } 338 } 339 340 return FinalOrder; 341 } 342 343 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 344 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) { 345 dbgs() << " Ops: "; 346 for (auto Op : Values) { 347 if (auto *VPInstr = cast_or_null<VPInstruction>(Op)) 348 if (auto *Instr = VPInstr->getUnderlyingInstr()) { 349 dbgs() << *Instr << " | "; 350 continue; 351 } 352 dbgs() << " nullptr | "; 353 } 354 dbgs() << "\n"; 355 } 356 #endif 357 358 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { 359 assert(!Values.empty() && "Need some operands!"); 360 361 // If we already visited this instruction bundle, re-use the existing node 362 auto I = BundleToCombined.find(to_vector<4>(Values)); 363 if (I != BundleToCombined.end()) { 364 #ifndef NDEBUG 365 // Check that the resulting graph is a tree. If we re-use a node, this means 366 // its values have multiple users. We only allow this, if all users of each 367 // value are the same instruction. 
368 for (auto *V : Values) { 369 auto UI = V->user_begin(); 370 auto *FirstUser = *UI++; 371 while (UI != V->user_end()) { 372 assert(*UI == FirstUser && "Currently we only support SLP trees."); 373 UI++; 374 } 375 } 376 #endif 377 return I->second; 378 } 379 380 // Dump inputs 381 LLVM_DEBUG({ 382 dbgs() << "buildGraph: "; 383 dumpBundle(Values); 384 }); 385 386 if (!areVectorizable(Values)) 387 return markFailed(); 388 389 assert(getOpcode(Values) && "Opcodes for all values must match"); 390 unsigned ValuesOpcode = *getOpcode(Values); 391 392 SmallVector<VPValue *, 4> CombinedOperands; 393 if (areCommutative(Values)) { 394 bool MultiNodeRoot = !MultiNodeActive; 395 MultiNodeActive = true; 396 for (auto &Operands : getOperands(Values)) { 397 LLVM_DEBUG({ 398 dbgs() << " Visiting Commutative"; 399 dumpBundle(Operands); 400 }); 401 402 auto OperandsOpcode = getOpcode(Operands); 403 if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) { 404 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n"); 405 CombinedOperands.push_back(buildGraph(Operands)); 406 } else { 407 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n"); 408 // Create dummy VPInstruction, which will we replace later by the 409 // re-ordered operand. 
410 VPInstruction *Op = new VPInstruction(0, {}); 411 CombinedOperands.push_back(Op); 412 MultiNodeOps.emplace_back(Op, Operands); 413 } 414 } 415 416 if (MultiNodeRoot) { 417 LLVM_DEBUG(dbgs() << "Reorder \n"); 418 MultiNodeActive = false; 419 420 auto FinalOrder = reorderMultiNodeOps(); 421 422 MultiNodeOps.clear(); 423 for (auto &Ops : FinalOrder) { 424 VPInstruction *NewOp = buildGraph(Ops.second); 425 Ops.first->replaceAllUsesWith(NewOp); 426 for (unsigned i = 0; i < CombinedOperands.size(); i++) 427 if (CombinedOperands[i] == Ops.first) 428 CombinedOperands[i] = NewOp; 429 delete Ops.first; 430 Ops.first = NewOp; 431 } 432 LLVM_DEBUG(dbgs() << "Found final order\n"); 433 } 434 } else { 435 LLVM_DEBUG(dbgs() << " NonCommuntative\n"); 436 if (ValuesOpcode == Instruction::Load) 437 for (VPValue *V : Values) 438 CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0)); 439 else 440 for (auto &Operands : getOperands(Values)) 441 CombinedOperands.push_back(buildGraph(Operands)); 442 } 443 444 unsigned Opcode; 445 switch (ValuesOpcode) { 446 case Instruction::Load: 447 Opcode = VPInstruction::SLPLoad; 448 break; 449 case Instruction::Store: 450 Opcode = VPInstruction::SLPStore; 451 break; 452 default: 453 Opcode = ValuesOpcode; 454 break; 455 } 456 457 if (!CompletelySLP) 458 return markFailed(); 459 460 assert(CombinedOperands.size() > 0 && "Need more some operands"); 461 auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr(); 462 auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); 463 VPI->setUnderlyingInstr(Inst); 464 465 LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " 466 << *cast<VPInstruction>(Values[0]) << "\n"); 467 addCombined(Values, VPI); 468 return VPI; 469 } 470