//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenMemoryInstructionSC: {
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

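// Together with mayHaveSideEffects() below, these queries let VPlan
// transforms decide whether a recipe can be moved or erased. A minimal sketch
// of such a client (the real logic lives in VPlanTransforms):
//   if (!R.mayHaveSideEffects() && R.getVPSingleValue()->getNumUsers() == 0)
//     R.eraseFromParent();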
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenMemoryInstructionSC:
    assert(cast<VPWidenMemoryInstructionRecipe>(this)
                   ->getIngredient()
                   .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideEffects result for ingredient differs from this "
           "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}

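// For illustration, moving a recipe R directly after another recipe Pos is
// the composition of the two primitives below:
//   R->removeFromParent();
//   R->insertAfter(Pos);
// which is exactly what moveAfter() implements.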
void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

Value *VPInstruction::generateInstruction(VPTransformState &State,
                                          unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  if (Instruction::isBinaryOp(getOpcode())) {
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0);

    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
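  // The active-lane-mask intrinsic computes, per lane i, the predicate
  // (first-lane-IV + i) < trip-count, yielding a <VF x i1> mask that is
  // all-true while at least a full vector of iterations remains.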
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    // vector.ph:
    //   v_init = vector(..., ..., ..., a[-1])
    //   br vector.body
    //
    // vector.body
    //   i = phi [0, vector.ph], [i+4, vector.body]
    //   v1 = phi [v_init, vector.ph], [v2, vector.body]
    //   v2 = a[i, i+1, i+2, i+3];
    //   v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
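  // The next opcode computes max(TC - VF * UF, 0) without underflowing, e.g.
  // for TC = 10, VF = 4, UF = 2 it yields 2, and for TC = 6 it yields 0.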
  case VPInstruction::CalculateTripCountMinusVF: {
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now
    // and to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    State.setDebugLocFrom(getDebugLoc());

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part);

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
          TrackingVH<Value> ReductionStartValue =
              RdxDesc.getRecurrenceStartValue();
          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
                                         ReducedPartRdx, RdxPart);
        } else
          ReducedPartRdx =
              createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to
      // extend the reduction to the wider type before we branch to the
      // original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
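    // Such an "intermediate store" arises from source like
    //   for (...) { sum += a[i]; *p = sum; }
    // where p is loop-invariant: the per-iteration stores are sunk out of the
    // loop and only the final reduced value needs to be stored.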
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we
  // don't support Call and PHI opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *GeneratedValue = generateInstruction(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generateInstruction must produce a value");
    State.set(this, GeneratedValue, Part);
  }
}

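// Example: with UF = 2, execute() above emits one IR value per unroll part,
// so a binary VPInstruction 'add' over operands A and B expands to roughly
//   %v0 = add <VF x ty> %A0, %B0
//   %v1 = add <VF x ty> %A1, %B1
// with part 1 simply reusing part 0 when only the first part is demanded.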
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto &CI = *cast<CallInst>(getUnderlyingInstr());
  assert(!isa<DbgInfoIntrinsic>(CI) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(
          VectorType::get(CI.getType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar
      // value from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
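      // e.g. an overloaded intrinsic such as llvm.smax applied to <VF x i32>
      // operands resolves to an @llvm.smax.vNi32 declaration, with TysForDecl
      // supplying the overloaded types collected above.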
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      assert(Variant != nullptr && "Can't create vector function.");
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

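  // e.g. a condition like 'n > 0' that is computed inside the loop but is
  // loop-invariant is materialized once from lane 0 and reused by every part;
  // a varying condition is re-fetched per part below.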
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel,
                      dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V,
                        dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
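    // e.g. a scalar 'icmp slt i32 %a, %b' becomes one
    // 'icmp slt <VF x i32>' per unroll part, yielding <VF x i1> mask values.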
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C,
                        dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode: "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant for FP induction variables.
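/// For example, for Val = splat(S), StartIdx = 0, Step = 2 and VF = 4, the
/// result is <S, S + 2, S + 4, S + 6> for an integer induction.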
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx.
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating-point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

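// For a widened induction starting at 0 with step 1 and VF = 4, the vector
// preheader below seeds vec.ind = <0, 1, 2, 3>, and each unrolled part then
// advances it by a further splat of VF * step ("step.add").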
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new
  // induction variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step
  // value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop
  // preheader.
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it
  //        doesn't handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

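// e.g. an induction starting at 0 with a live-in constant step of 1 is
// canonical; one starting at a non-zero value, or whose step is computed by a
// preheader recipe (such as a SCEV expansion), is not.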
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as a live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getCanonicalIV()->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);

  if (TruncResultTy)
    O << " (truncated to " << *TruncResultTy << ")";
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps. BaseIV is the scalar induction variable
  // on which to base the steps, Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  if (BaseIVTy != Step->getType()) {
    // TODO: Also use VPDerivedIVRecipe when only the step needs truncating,
    // to avoid a separate truncate here.
    assert(Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");
    Step = State.Builder.CreateTrunc(Step, BaseIVTy);
  }

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
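  // e.g. for BaseIV = 0, Step = 2, VF = 4 and UF = 1, this materializes the
  // per-lane scalars 0, 2, 4, 6 (or just the lane-0 value when only the
  // first lane is demanded).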
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum
      // number of elements so we do those below. This improves the code
      // quality when trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.
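  // e.g. 'gep %base, %iv' with a loop-varying %iv widens into a GEP over a
  // vector of indices, yielding one pointer per lane, whereas a GEP whose
  // operands are all invariant is computed once and splatted (see below).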

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

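// For reverse accesses, the per-part pointer below is offset by
// (-Part * RunTimeVF) + (1 - RunTimeVF) elements; e.g. for Part = 0 and
// VF = 4 it addresses elements [-3, 0], so the wide load or store starts at
// the last vector element.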
"Inv" : "Var") << "]"; 1291 1292 O << " "; 1293 printAsOperand(O, SlotTracker); 1294 O << " = getelementptr"; 1295 printFlags(O); 1296 printOperands(O, SlotTracker); 1297 } 1298 #endif 1299 1300 void VPVectorPointerRecipe ::execute(VPTransformState &State) { 1301 auto &Builder = State.Builder; 1302 State.setDebugLocFrom(getDebugLoc()); 1303 for (unsigned Part = 0; Part < State.UF; ++Part) { 1304 // Calculate the pointer for the specific unroll-part. 1305 Value *PartPtr = nullptr; 1306 // Use i32 for the gep index type when the value is constant, 1307 // or query DataLayout for a more suitable index type otherwise. 1308 const DataLayout &DL = 1309 Builder.GetInsertBlock()->getModule()->getDataLayout(); 1310 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) 1311 ? DL.getIndexType(IndexedTy->getPointerTo()) 1312 : Builder.getInt32Ty(); 1313 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); 1314 bool InBounds = isInBounds(); 1315 if (IsReverse) { 1316 // If the address is consecutive but reversed, then the 1317 // wide store needs to start at the last vector element. 1318 // RunTimeVF = VScale * VF.getKnownMinValue() 1319 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 1320 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 1321 // NumElt = -Part * RunTimeVF 1322 Value *NumElt = Builder.CreateMul( 1323 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 1324 // LastLane = 1 - RunTimeVF 1325 Value *LastLane = 1326 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 1327 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); 1328 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); 1329 } else { 1330 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 1331 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); 1332 } 1333 1334 State.set(this, PartPtr, Part); 1335 } 1336 } 1337 1338 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1339 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 1340 VPSlotTracker &SlotTracker) const { 1341 O << Indent; 1342 printAsOperand(O, SlotTracker); 1343 O << " = vector-pointer "; 1344 if (IsReverse) 1345 O << "(reverse) "; 1346 1347 printOperands(O, SlotTracker); 1348 } 1349 #endif 1350 1351 void VPBlendRecipe::execute(VPTransformState &State) { 1352 State.setDebugLocFrom(getDebugLoc()); 1353 // We know that all PHIs in non-header blocks are converted into 1354 // selects, so we don't have to worry about the insertion order and we 1355 // can just use the builder. 1356 // At this point we generate the predication tree. There may be 1357 // duplications since this is a simple recursive scan, but future 1358 // optimizations will clean it up. 1359 1360 unsigned NumIncoming = getNumIncomingValues(); 1361 1362 // Generate a sequence of selects of the form: 1363 // SELECT(Mask3, In3, 1364 // SELECT(Mask2, In2, 1365 // SELECT(Mask1, In1, 1366 // In0))) 1367 // Note that Mask0 is never used: lanes for which no path reaches this phi and 1368 // are essentially undef are taken from In0. 1369 VectorParts Entry(State.UF); 1370 for (unsigned In = 0; In < NumIncoming; ++In) { 1371 for (unsigned Part = 0; Part < State.UF; ++Part) { 1372 // We might have single edge PHIs (blocks) - use an identity 1373 // 'select' for the first PHI operand. 1374 Value *In0 = State.get(getIncomingValue(In), Part); 1375 if (In == 0) 1376 Entry[Part] = In0; // Initialize with the first incoming value. 
  VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}

void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sunk "
         "outside of loop)";
}
#endif

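// Example: a predicated scalar load whose result feeds a widened user through
// a VPPredInstPHIRecipe must also pack its per-lane scalar results back into
// a vector, which is what shouldPack() detects.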
"CLONE " : "REPLICATE "); 1449 1450 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 1451 printAsOperand(O, SlotTracker); 1452 O << " = "; 1453 } 1454 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 1455 O << "call"; 1456 printFlags(O); 1457 O << "@" << CB->getCalledFunction()->getName() << "("; 1458 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 1459 O, [&O, &SlotTracker](VPValue *Op) { 1460 Op->printAsOperand(O, SlotTracker); 1461 }); 1462 O << ")"; 1463 } else { 1464 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 1465 printFlags(O); 1466 printOperands(O, SlotTracker); 1467 } 1468 1469 if (shouldPack()) 1470 O << " (S->V)"; 1471 } 1472 #endif 1473 1474 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 1475 assert(State.Instance && "Branch on Mask works only on single instance."); 1476 1477 unsigned Part = State.Instance->Part; 1478 unsigned Lane = State.Instance->Lane.getKnownLane(); 1479 1480 Value *ConditionBit = nullptr; 1481 VPValue *BlockInMask = getMask(); 1482 if (BlockInMask) { 1483 ConditionBit = State.get(BlockInMask, Part); 1484 if (ConditionBit->getType()->isVectorTy()) 1485 ConditionBit = State.Builder.CreateExtractElement( 1486 ConditionBit, State.Builder.getInt32(Lane)); 1487 } else // Block in mask is all-one. 1488 ConditionBit = State.Builder.getTrue(); 1489 1490 // Replace the temporary unreachable terminator with a new conditional branch, 1491 // whose two destinations will be set later when they are created. 1492 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 1493 assert(isa<UnreachableInst>(CurrentTerminator) && 1494 "Expected to replace unreachable terminator with conditional branch."); 1495 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 1496 CondBr->setSuccessor(0, nullptr); 1497 ReplaceInstWithInst(CurrentTerminator, CondBr); 1498 } 1499 1500 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 1501 assert(State.Instance && "Predicated instruction PHI works per instance."); 1502 Instruction *ScalarPredInst = 1503 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 1504 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 1505 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 1506 assert(PredicatingBB && "Predicated block has no single predecessor."); 1507 assert(isa<VPReplicateRecipe>(getOperand(0)) && 1508 "operand must be VPReplicateRecipe"); 1509 1510 // By current pack/unpack logic we need to generate only a single phi node: if 1511 // a vector value for the predicated instruction exists at this point it means 1512 // the instruction has vector users only, and a phi for the vector value is 1513 // needed. In this case the recipe of the predicated instruction is marked to 1514 // also do that packing, thereby "hoisting" the insert-element sequence. 1515 // Otherwise, a phi node for the scalar value is needed. 1516 unsigned Part = State.Instance->Part; 1517 if (State.hasVectorValue(getOperand(0), Part)) { 1518 Value *VectorValue = State.get(getOperand(0), Part); 1519 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 1520 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 1521 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 1522 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";

  if (!isStore()) {
    getVPSingleValue()->printAsOperand(O, SlotTracker);
    O << " = ";
  }
  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";

  printOperands(O, SlotTracker);
}
#endif

void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

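// Example: a query with Kind == InductionDescriptor::IK_IntInduction, Start
// equal to this phi's start value, Ty matching getScalarType(), and Step a
// live-in ConstantInt with value 1 is canonical; a widened type, a different
// start value, or any step other than a live-in constant 1 is not.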
bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
    Type *Ty) const {
  // The types must match and it must be an integer induction.
  if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
  return IsScalarAfterVectorization &&
         (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

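// Only the last lane of the previous iteration is live across iterations of a
// first-order recurrence. As a sketch for VF = 4 and an i32 start value
// (illustrative names), the preheader below initializes:
//   %vector.recur.init = insertelement <4 x i32> poison, i32 %init, i32 3
// i.e. the scalar start value goes into the last lane; in-loop users splice it
// together with the current iteration's vector.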
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

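// A few identity values returned by getRecurrenceIdentity below: 0 for integer
// add, 1 for mul (min/max and any-of reductions have no neutral element and
// reuse the start value instead). Note that only the part-0 phi receives the
// actual start value; the remaining unroll parts start at the identity so the
// final combined reduction does not double-count the start value.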
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif