//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenMemoryInstructionSC: {
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
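  // Derived IVs and predicated-instruction phis only compute values and never
  // touch memory, so they have no side effects.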
  case VPDerivedIVSC:
  case VPPredInstPHISC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenMemoryInstructionSC:
    assert(cast<VPWidenMemoryInstructionRecipe>(this)
               ->getIngredient()
               .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideEffects result for ingredient differs from this "
           "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
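  // Unlink from the parent's recipe list without deleting the recipe; the
  // caller takes ownership (in contrast to eraseFromParent below).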
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      VPValue(this), Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      VPValue(this), Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

Value *VPInstruction::generateInstruction(VPTransformState &State,
                                          unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  if (Instruction::isBinaryOp(getOpcode())) {
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0);

    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    // vector.ph:
    //   v_init = vector(..., ..., ..., a[-1])
    //   br vector.body
    //
    // vector.body
    //   i = phi [0, vector.ph], [i+4, vector.body]
    //   v1 = phi [v_init, vector.ph], [v2, vector.body]
    //   v2 = a[i, i+1, i+2, i+3];
    //   v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    State.setDebugLocFrom(getDebugLoc());

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part);

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
          TrackingVH<Value> ReductionStartValue =
              RdxDesc.getRecurrenceStartValue();
          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
                                         ReducedPartRdx, RdxPart);
        } else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call and PHI opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *GeneratedValue = generateInstruction(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generateInstruction must produce a value");
    State.set(this, GeneratedValue, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
TC - VF : 0"; 554 break; 555 case VPInstruction::CanonicalIVIncrementForPart: 556 O << "VF * Part +"; 557 break; 558 case VPInstruction::BranchOnCount: 559 O << "branch-on-count"; 560 break; 561 case VPInstruction::ComputeReductionResult: 562 O << "compute-reduction-result"; 563 break; 564 default: 565 O << Instruction::getOpcodeName(getOpcode()); 566 } 567 568 printFlags(O); 569 printOperands(O, SlotTracker); 570 571 if (auto DL = getDebugLoc()) { 572 O << ", !dbg "; 573 DL.print(O); 574 } 575 } 576 #endif 577 578 void VPWidenCallRecipe::execute(VPTransformState &State) { 579 assert(State.VF.isVector() && "not widening"); 580 auto &CI = *cast<CallInst>(getUnderlyingInstr()); 581 assert(!isa<DbgInfoIntrinsic>(CI) && 582 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 583 State.setDebugLocFrom(CI.getDebugLoc()); 584 585 bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic; 586 FunctionType *VFTy = nullptr; 587 if (Variant) 588 VFTy = Variant->getFunctionType(); 589 for (unsigned Part = 0; Part < State.UF; ++Part) { 590 SmallVector<Type *, 2> TysForDecl; 591 // Add return type if intrinsic is overloaded on it. 592 if (UseIntrinsic && 593 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) 594 TysForDecl.push_back( 595 VectorType::get(CI.getType()->getScalarType(), State.VF)); 596 SmallVector<Value *, 4> Args; 597 for (const auto &I : enumerate(operands())) { 598 // Some intrinsics have a scalar argument - don't replace it with a 599 // vector. 600 // Some vectorized function variants may also take a scalar argument, 601 // e.g. linear parameters for pointers. 602 Value *Arg; 603 if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) || 604 (UseIntrinsic && 605 isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))) 606 Arg = State.get(I.value(), VPIteration(0, 0)); 607 else 608 Arg = State.get(I.value(), Part); 609 if (UseIntrinsic && 610 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) 611 TysForDecl.push_back(Arg->getType()); 612 Args.push_back(Arg); 613 } 614 615 Function *VectorF; 616 if (UseIntrinsic) { 617 // Use vector version of the intrinsic. 
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at \p StartIdx.
/// \p BinOp is relevant for FP induction variables.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  // handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getCanonicalIV()->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);

  if (TruncResultTy)
    O << " (truncated to " << *TruncResultTy << ")";
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  if (BaseIVTy != Step->getType()) {
    // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
    // avoid separate truncate here.
    assert(Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");
    Step = State.Builder.CreateTrunc(Step, BaseIVTy);
  }

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
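  // The start indices are computed with integer arithmetic even for FP
  // inductions; they are converted to the FP type via sitofp below.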
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    // loop-invariant operands, this special case will no longer be
    // required. We would add the scalarization decision to
    // collectLoopScalars() and teach getVectorValue() to broadcast
    // the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
"Inv" : "Var") << "]"; 1289 1290 O << " "; 1291 printAsOperand(O, SlotTracker); 1292 O << " = getelementptr"; 1293 printFlags(O); 1294 printOperands(O, SlotTracker); 1295 } 1296 #endif 1297 1298 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1299 auto &Builder = State.Builder; 1300 State.setDebugLocFrom(getDebugLoc()); 1301 for (unsigned Part = 0; Part < State.UF; ++Part) { 1302 // Calculate the pointer for the specific unroll-part. 1303 Value *PartPtr = nullptr; 1304 // Use i32 for the gep index type when the value is constant, 1305 // or query DataLayout for a more suitable index type otherwise. 1306 const DataLayout &DL = 1307 Builder.GetInsertBlock()->getModule()->getDataLayout(); 1308 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1309 ? DL.getIndexType(IndexedTy->getPointerTo()) 1310 : Builder.getInt32Ty(); 1311 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1312 bool InBounds = isInBounds(); 1313 if (IsReverse) { 1314 // If the address is consecutive but reversed, then the 1315 // wide store needs to start at the last vector element. 1316 // RunTimeVF = VScale * VF.getKnownMinValue() 1317 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1318 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1319 // NumElt = -Part * RunTimeVF 1320 Value *NumElt = Builder.CreateMul( 1321 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 1322 // LastLane = 1 - RunTimeVF 1323 Value *LastLane = 1324 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1325 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1326 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); 1327 } else { 1328 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 1329 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1330 } 1331 1332 State.set(this, PartPtr, Part); 1333 } 1334 } 1335 1336 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1337 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1338 VPSlotTracker &SlotTracker) const { 1339 O << Indent; 1340 printAsOperand(O, SlotTracker); 1341 O << " = vector-pointer "; 1342 if (IsReverse) 1343 O << "(reverse) "; 1344 1345 printOperands(O, SlotTracker); 1346 } 1347 #endif 1348 1349 void VPBlendRecipe::execute(VPTransformState &State) { 1350 State.setDebugLocFrom(getDebugLoc()); 1351 // We know that all PHIs in non-header blocks are converted into 1352 // selects, so we don't have to worry about the insertion order and we 1353 // can just use the builder. 1354 // At this point we generate the predication tree. There may be 1355 // duplications since this is a simple recursive scan, but future 1356 // optimizations will clean it up. 1357 1358 unsigned NumIncoming = getNumIncomingValues(); 1359 1360 // Generate a sequence of selects of the form: 1361 // SELECT(Mask3, In3, 1362 // SELECT(Mask2, In2, 1363 // SELECT(Mask1, In1, 1364 // In0))) 1365 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1366 // are essentially undef are taken from In0. 1367 VectorParts Entry(State.UF); 1368 for (unsigned In = 0; In < NumIncoming; ++In) { 1369 for (unsigned Part = 0; Part < State.UF; ++Part) { 1370 // We might have single edge PHIs (blocks) - use an identity 1371 // 'select' for the first PHI operand. 1372 Value *In0 = State.get(getIncomingValue(In), Part); 1373 if (In == 0) 1374 Entry[Part] = In0; // Initialize with the first incoming value. 
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}

void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sunk "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
"CLONE " : "REPLICATE "); 1447 1448 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 1449 printAsOperand(O, SlotTracker); 1450 O << " = "; 1451 } 1452 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 1453 O << "call"; 1454 printFlags(O); 1455 O << "@" << CB->getCalledFunction()->getName() << "("; 1456 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 1457 O, [&O, &SlotTracker](VPValue *Op) { 1458 Op->printAsOperand(O, SlotTracker); 1459 }); 1460 O << ")"; 1461 } else { 1462 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 1463 printFlags(O); 1464 printOperands(O, SlotTracker); 1465 } 1466 1467 if (shouldPack()) 1468 O << " (S->V)"; 1469 } 1470 #endif 1471 1472 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 1473 assert(State.Instance && "Branch on Mask works only on single instance."); 1474 1475 unsigned Part = State.Instance->Part; 1476 unsigned Lane = State.Instance->Lane.getKnownLane(); 1477 1478 Value *ConditionBit = nullptr; 1479 VPValue *BlockInMask = getMask(); 1480 if (BlockInMask) { 1481 ConditionBit = State.get(BlockInMask, Part); 1482 if (ConditionBit->getType()->isVectorTy()) 1483 ConditionBit = State.Builder.CreateExtractElement( 1484 ConditionBit, State.Builder.getInt32(Lane)); 1485 } else // Block in mask is all-one. 1486 ConditionBit = State.Builder.getTrue(); 1487 1488 // Replace the temporary unreachable terminator with a new conditional branch, 1489 // whose two destinations will be set later when they are created. 1490 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 1491 assert(isa<UnreachableInst>(CurrentTerminator) && 1492 "Expected to replace unreachable terminator with conditional branch."); 1493 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 1494 CondBr->setSuccessor(0, nullptr); 1495 ReplaceInstWithInst(CurrentTerminator, CondBr); 1496 } 1497 1498 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 1499 assert(State.Instance && "Predicated instruction PHI works per instance."); 1500 Instruction *ScalarPredInst = 1501 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 1502 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 1503 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 1504 assert(PredicatingBB && "Predicated block has no single predecessor."); 1505 assert(isa<VPReplicateRecipe>(getOperand(0)) && 1506 "operand must be VPReplicateRecipe"); 1507 1508 // By current pack/unpack logic we need to generate only a single phi node: if 1509 // a vector value for the predicated instruction exists at this point it means 1510 // the instruction has vector users only, and a phi for the vector value is 1511 // needed. In this case the recipe of the predicated instruction is marked to 1512 // also do that packing, thereby "hoisting" the insert-element sequence. 1513 // Otherwise, a phi node for the scalar value is needed. 1514 unsigned Part = State.Instance->Part; 1515 if (State.hasVectorValue(getOperand(0), Part)) { 1516 Value *VectorValue = State.get(getOperand(0), Part); 1517 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 1518 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 1519 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 1520 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";

  if (!isStore()) {
    getVPSingleValue()->printAsOperand(O, SlotTracker);
    O << " = ";
  }
  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";

  printOperands(O, SlotTracker);
}
#endif

void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
    Type *Ty) const {
  // The types must match and it must be an integer induction.
  if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
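  // Such a step is only known at runtime, whereas the canonical IV always
  // advances by a compile-time constant step of one, checked below.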
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
  return IsScalarAfterVectorization &&
         (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();
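  // For vector VFs the start value is inserted into the last lane of an
  // otherwise poison vector in the preheader, e.g. for VF=4 (with %init an
  // illustrative name for the start value):
  //   %vector.recur.init = insertelement <4 x i32> poison, i32 %init, i32 3
  // so the recurrence phi can supply the "previous" value for lane 0 of the
  // first vector iteration.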

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  PHINode *PN = cast<PHINode>(getUnderlyingValue());
  auto &Builder = State.Builder;

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy =
      ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
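    // E.g. for a max reduction started at S, max(S, max(S, x)) == max(S, x),
    // so seeding every vector lane with S leaves the final result unchanged.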
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif