//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                  ? VPLane::getFirstLane()
                  : VPLane::getLastLaneForVF(State.VF);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live out phi's in the exiting block
  // via middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  // Set insertion point in PredBB in case an extract needs to be generated.
  // TODO: Model extracts explicitly.
  State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
  Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  if (auto *UI = getInstructionForCost(this))
    if (Ctx.skipCostComputation(UI, VF.isVector()))
      return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost for the recipe falling back to the legacy cost model using
  // the underlying instruction. If there is no underlying instruction, returns
  // 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform, avoid computing their cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part,
                         OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
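    // Explanatory note (not from the original source): the AVL is simply the
    // number of iterations still to execute (trip count minus the current
    // index); GetEVL above clamps it to the number of lanes the target will
    // actually process in this step.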
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), Part, /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it
  //        doesn't handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.
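  // Illustrative note (not from the original source): for a GEP like
  // A[3][i] with loop-varying i, the base pointer and the constant index
  // stay scalar while i is widened to a vector, so a single GEP yields a
  // vector of pointers.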

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPVectorPointerRecipe ::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr = nullptr;
    // Use i32 for the gep index type when the value is constant,
    // or query DataLayout for a more suitable index type otherwise.
    const DataLayout &DL =
        Builder.GetInsertBlock()->getDataLayout();
    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
                        ? DL.getIndexType(IndexedTy->getPointerTo())
                        : Builder.getInt32Ty();
    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
    bool InBounds = isInBounds();
    if (IsReverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(
          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane =
          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
    } else {
      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
    }

    State.set(this, PartPtr, Part, /*IsScalar*/ true);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  if (IsReverse)
    O << "(reverse) ";

  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  VectorParts Entry(State.UF);
  bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
1661 Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed); 1662 if (In == 0) 1663 Entry[Part] = In0; // Initialize with the first incoming value. 1664 else { 1665 // Select between the current value and the previous incoming edge 1666 // based on the incoming mask. 1667 Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed); 1668 Entry[Part] = 1669 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 1670 } 1671 } 1672 } 1673 for (unsigned Part = 0; Part < State.UF; ++Part) 1674 State.set(this, Entry[Part], Part, OnlyFirstLaneUsed); 1675 } 1676 1677 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1678 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 1679 VPSlotTracker &SlotTracker) const { 1680 O << Indent << "BLEND "; 1681 printAsOperand(O, SlotTracker); 1682 O << " ="; 1683 if (getNumIncomingValues() == 1) { 1684 // Not a User of any mask: not really blending, this is a 1685 // single-predecessor phi. 1686 O << " "; 1687 getIncomingValue(0)->printAsOperand(O, SlotTracker); 1688 } else { 1689 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 1690 O << " "; 1691 getIncomingValue(I)->printAsOperand(O, SlotTracker); 1692 if (I == 0) 1693 continue; 1694 O << "/"; 1695 getMask(I)->printAsOperand(O, SlotTracker); 1696 } 1697 } 1698 } 1699 #endif 1700 1701 void VPReductionRecipe::execute(VPTransformState &State) { 1702 assert(!State.Instance && "Reduction being replicated."); 1703 Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true); 1704 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1705 // Propagate the fast-math flags carried by the underlying instruction. 1706 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 1707 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1708 for (unsigned Part = 0; Part < State.UF; ++Part) { 1709 Value *NewVecOp = State.get(getVecOp(), Part); 1710 if (VPValue *Cond = getCondOp()) { 1711 Value *NewCond = State.get(Cond, Part, State.VF.isScalar()); 1712 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 1713 Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); 1714 Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, 1715 RdxDesc.getFastMathFlags()); 1716 if (State.VF.isVector()) { 1717 Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 1718 } 1719 1720 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); 1721 NewVecOp = Select; 1722 } 1723 Value *NewRed; 1724 Value *NextInChain; 1725 if (IsOrdered) { 1726 if (State.VF.isVector()) 1727 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, 1728 PrevInChain); 1729 else 1730 NewRed = State.Builder.CreateBinOp( 1731 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 1732 NewVecOp); 1733 PrevInChain = NewRed; 1734 } else { 1735 PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); 1736 NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); 1737 } 1738 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 1739 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 1740 NewRed, PrevInChain); 1741 } else if (IsOrdered) 1742 NextInChain = NewRed; 1743 else 1744 NextInChain = State.Builder.CreateBinOp( 1745 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); 1746 State.set(this, NextInChain, Part, /*IsScalar*/ true); 1747 } 1748 } 1749 1750 void VPReductionEVLRecipe::execute(VPTransformState &State) { 1751 assert(!State.Instance && "Reduction being replicated."); 1752 assert(State.UF == 1 && 1753 "Expected only UF == 1 when vectorizing with explicit vector length."); 1754 1755 auto &Builder = State.Builder; 1756 // Propagate the fast-math flags carried by the underlying instruction. 1757 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 1758 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1759 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 1760 1761 RecurKind Kind = RdxDesc.getRecurrenceKind(); 1762 Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true); 1763 Value *VecOp = State.get(getVecOp(), 0); 1764 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 1765 1766 VectorBuilder VBuilder(Builder); 1767 VBuilder.setEVL(EVL); 1768 Value *Mask; 1769 // TODO: move the all-true mask generation into VectorBuilder. 1770 if (VPValue *CondOp = getCondOp()) 1771 Mask = State.get(CondOp, 0); 1772 else 1773 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 1774 VBuilder.setMask(Mask); 1775 1776 Value *NewRed; 1777 if (isOrdered()) { 1778 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); 1779 } else { 1780 NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc); 1781 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 1782 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 1783 else 1784 NewRed = Builder.CreateBinOp( 1785 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev); 1786 } 1787 State.set(this, NewRed, 0, /*IsScalar*/ true); 1788 } 1789 1790 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1791 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 1792 VPSlotTracker &SlotTracker) const { 1793 O << Indent << "REDUCE "; 1794 printAsOperand(O, SlotTracker); 1795 O << " = "; 1796 getChainOp()->printAsOperand(O, SlotTracker); 1797 O << " +"; 1798 if (isa<FPMathOperator>(getUnderlyingInstr())) 1799 O << getUnderlyingInstr()->getFastMathFlags(); 1800 O << " reduce." 
<< Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 1801 getVecOp()->printAsOperand(O, SlotTracker); 1802 if (isConditional()) { 1803 O << ", "; 1804 getCondOp()->printAsOperand(O, SlotTracker); 1805 } 1806 O << ")"; 1807 if (RdxDesc.IntermediateStore) 1808 O << " (with final reduction value stored in invariant address sank " 1809 "outside of loop)"; 1810 } 1811 1812 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 1813 VPSlotTracker &SlotTracker) const { 1814 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); 1815 O << Indent << "REDUCE "; 1816 printAsOperand(O, SlotTracker); 1817 O << " = "; 1818 getChainOp()->printAsOperand(O, SlotTracker); 1819 O << " +"; 1820 if (isa<FPMathOperator>(getUnderlyingInstr())) 1821 O << getUnderlyingInstr()->getFastMathFlags(); 1822 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; 1823 getVecOp()->printAsOperand(O, SlotTracker); 1824 O << ", "; 1825 getEVL()->printAsOperand(O, SlotTracker); 1826 if (isConditional()) { 1827 O << ", "; 1828 getCondOp()->printAsOperand(O, SlotTracker); 1829 } 1830 O << ")"; 1831 if (RdxDesc.IntermediateStore) 1832 O << " (with final reduction value stored in invariant address sank " 1833 "outside of loop)"; 1834 } 1835 #endif 1836 1837 bool VPReplicateRecipe::shouldPack() const { 1838 // Find if the recipe is used by a widened recipe via an intervening 1839 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 1840 return any_of(users(), [](const VPUser *U) { 1841 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 1842 return any_of(PredR->users(), [PredR](const VPUser *U) { 1843 return !U->usesScalars(PredR); 1844 }); 1845 return false; 1846 }); 1847 } 1848 1849 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1850 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 1851 VPSlotTracker &SlotTracker) const { 1852 O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); 1853 1854 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 1855 printAsOperand(O, SlotTracker); 1856 O << " = "; 1857 } 1858 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 1859 O << "call"; 1860 printFlags(O); 1861 O << "@" << CB->getCalledFunction()->getName() << "("; 1862 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 1863 O, [&O, &SlotTracker](VPValue *Op) { 1864 Op->printAsOperand(O, SlotTracker); 1865 }); 1866 O << ")"; 1867 } else { 1868 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 1869 printFlags(O); 1870 printOperands(O, SlotTracker); 1871 } 1872 1873 if (shouldPack()) 1874 O << " (S->V)"; 1875 } 1876 #endif 1877 1878 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such 1879 /// if it is either defined outside the vector region or its operand is known to 1880 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). 1881 /// TODO: Uniformity should be associated with a VPValue and there should be a 1882 /// generic way to check. 
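/// A minimal illustration (assumed example, not exhaustive): a cast of the
/// VPCanonicalIVPHIRecipe yields the same scalar for every unroll part, so
/// execute() below can reuse the value generated for part 0, whereas a cast
/// of a per-part operand is regenerated for each part.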
1883 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) { 1884 return C->isDefinedOutsideVectorRegions() || 1885 isa<VPDerivedIVRecipe>(C->getOperand(0)) || 1886 isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)); 1887 } 1888 1889 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) { 1890 assert(vputils::onlyFirstLaneUsed(this) && 1891 "Codegen only implemented for first lane."); 1892 switch (Opcode) { 1893 case Instruction::SExt: 1894 case Instruction::ZExt: 1895 case Instruction::Trunc: { 1896 // Note: SExt/ZExt not used yet. 1897 Value *Op = State.get(getOperand(0), VPIteration(Part, 0)); 1898 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy); 1899 } 1900 default: 1901 llvm_unreachable("opcode not implemented yet"); 1902 } 1903 } 1904 1905 void VPScalarCastRecipe ::execute(VPTransformState &State) { 1906 bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this); 1907 for (unsigned Part = 0; Part != State.UF; ++Part) { 1908 Value *Res; 1909 // Only generate a single instance if the recipe is uniform across UFs and 1910 // VFs. 1911 if (Part > 0 && IsUniformAcrossVFsAndUFs) 1912 Res = State.get(this, VPIteration(0, 0)); 1913 else 1914 Res = generate(State, Part); 1915 State.set(this, Res, VPIteration(Part, 0)); 1916 } 1917 } 1918 1919 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1920 void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, 1921 VPSlotTracker &SlotTracker) const { 1922 O << Indent << "SCALAR-CAST "; 1923 printAsOperand(O, SlotTracker); 1924 O << " = " << Instruction::getOpcodeName(Opcode) << " "; 1925 printOperands(O, SlotTracker); 1926 O << " to " << *ResultTy; 1927 } 1928 #endif 1929 1930 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 1931 assert(State.Instance && "Branch on Mask works only on single instance."); 1932 1933 unsigned Part = State.Instance->Part; 1934 unsigned Lane = State.Instance->Lane.getKnownLane(); 1935 1936 Value *ConditionBit = nullptr; 1937 VPValue *BlockInMask = getMask(); 1938 if (BlockInMask) { 1939 ConditionBit = State.get(BlockInMask, Part); 1940 if (ConditionBit->getType()->isVectorTy()) 1941 ConditionBit = State.Builder.CreateExtractElement( 1942 ConditionBit, State.Builder.getInt32(Lane)); 1943 } else // Block in mask is all-one. 1944 ConditionBit = State.Builder.getTrue(); 1945 1946 // Replace the temporary unreachable terminator with a new conditional branch, 1947 // whose two destinations will be set later when they are created.
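// For illustration: before this point PrevBB ends in a placeholder
// 'unreachable'; afterwards it ends in a conditional branch on ConditionBit
// whose successors are both left null here and are filled in once the
// corresponding basic blocks have been created.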
1948 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 1949 assert(isa<UnreachableInst>(CurrentTerminator) && 1950 "Expected to replace unreachable terminator with conditional branch."); 1951 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 1952 CondBr->setSuccessor(0, nullptr); 1953 ReplaceInstWithInst(CurrentTerminator, CondBr); 1954 } 1955 1956 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 1957 assert(State.Instance && "Predicated instruction PHI works per instance."); 1958 Instruction *ScalarPredInst = 1959 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 1960 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 1961 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 1962 assert(PredicatingBB && "Predicated block has no single predecessor."); 1963 assert(isa<VPReplicateRecipe>(getOperand(0)) && 1964 "operand must be VPReplicateRecipe"); 1965 1966 // By current pack/unpack logic we need to generate only a single phi node: if 1967 // a vector value for the predicated instruction exists at this point it means 1968 // the instruction has vector users only, and a phi for the vector value is 1969 // needed. In this case the recipe of the predicated instruction is marked to 1970 // also do that packing, thereby "hoisting" the insert-element sequence. 1971 // Otherwise, a phi node for the scalar value is needed. 1972 unsigned Part = State.Instance->Part; 1973 if (State.hasVectorValue(getOperand(0), Part)) { 1974 Value *VectorValue = State.get(getOperand(0), Part); 1975 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 1976 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 1977 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 1978 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 1979 if (State.hasVectorValue(this, Part)) 1980 State.reset(this, VPhi, Part); 1981 else 1982 State.set(this, VPhi, Part); 1983 // NOTE: Currently we need to update the value of the operand, so the next 1984 // predicated iteration inserts its generated value in the correct vector. 1985 State.reset(getOperand(0), VPhi, Part); 1986 } else { 1987 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 1988 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 1989 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 1990 PredicatingBB); 1991 Phi->addIncoming(ScalarPredInst, PredicatedBB); 1992 if (State.hasScalarValue(this, *State.Instance)) 1993 State.reset(this, Phi, *State.Instance); 1994 else 1995 State.set(this, Phi, *State.Instance); 1996 // NOTE: Currently we need to update the value of the operand, so the next 1997 // predicated iteration inserts its generated value in the correct vector. 
1998 State.reset(getOperand(0), Phi, *State.Instance); 1999 } 2000 } 2001 2002 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2003 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2004 VPSlotTracker &SlotTracker) const { 2005 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 2006 printAsOperand(O, SlotTracker); 2007 O << " = "; 2008 printOperands(O, SlotTracker); 2009 } 2010 2011 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, 2012 VPSlotTracker &SlotTracker) const { 2013 O << Indent << "WIDEN "; 2014 printAsOperand(O, SlotTracker); 2015 O << " = load "; 2016 printOperands(O, SlotTracker); 2017 } 2018 2019 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2020 VPSlotTracker &SlotTracker) const { 2021 O << Indent << "WIDEN "; 2022 printAsOperand(O, SlotTracker); 2023 O << " = vp.load "; 2024 printOperands(O, SlotTracker); 2025 } 2026 2027 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, 2028 VPSlotTracker &SlotTracker) const { 2029 O << Indent << "WIDEN store "; 2030 printOperands(O, SlotTracker); 2031 } 2032 2033 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2034 VPSlotTracker &SlotTracker) const { 2035 O << Indent << "WIDEN vp.store "; 2036 printOperands(O, SlotTracker); 2037 } 2038 #endif 2039 2040 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, 2041 VectorType *DstVTy, const DataLayout &DL) { 2042 // Verify that V is a vector type with same number of elements as DstVTy. 2043 auto VF = DstVTy->getElementCount(); 2044 auto *SrcVecTy = cast<VectorType>(V->getType()); 2045 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2046 Type *SrcElemTy = SrcVecTy->getElementType(); 2047 Type *DstElemTy = DstVTy->getElementType(); 2048 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2049 "Vector elements must have same size"); 2050 2051 // Do a direct cast if element types are castable. 2052 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2053 return Builder.CreateBitOrPointerCast(V, DstVTy); 2054 } 2055 // V cannot be directly casted to desired vector type. 2056 // May happen when V is a floating point vector but DstVTy is a vector of 2057 // pointers or vice-versa. Handle this using a two-step bitcast using an 2058 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2059 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2060 "Only one type should be a pointer type"); 2061 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2062 "Only one type should be a floating point type"); 2063 Type *IntTy = 2064 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2065 auto *VecIntTy = VectorType::get(IntTy, VF); 2066 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2067 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2068 } 2069 2070 /// Return a vector containing interleaved elements from multiple 2071 /// smaller input vectors. 
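/// For example (illustrative): interleaving the fixed-width vectors
///   <a0, a1, a2, a3> and <b0, b1, b2, b3>
/// yields <a0, b0, a1, b1, a2, b2, a3, b3>. For scalable vectors the same
/// result is obtained via the llvm.vector.interleave2 intrinsic, since
/// arbitrary shufflevector masks are not available there.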
2072 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 2073 const Twine &Name) { 2074 unsigned Factor = Vals.size(); 2075 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 2076 2077 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 2078 #ifndef NDEBUG 2079 for (Value *Val : Vals) 2080 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 2081 #endif 2082 2083 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 2084 // must use intrinsics to interleave. 2085 if (VecTy->isScalableTy()) { 2086 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 2087 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, 2088 Vals, 2089 /*FMFSource=*/nullptr, Name); 2090 } 2091 2092 // Fixed length. Start by concatenating all vectors into a wide vector. 2093 Value *WideVec = concatenateVectors(Builder, Vals); 2094 2095 // Interleave the elements into the wide vector. 2096 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 2097 return Builder.CreateShuffleVector( 2098 WideVec, createInterleaveMask(NumElts, Factor), Name); 2099 } 2100 2101 // Try to vectorize the interleave group that \p Instr belongs to. 2102 // 2103 // E.g. Translate following interleaved load group (factor = 3): 2104 // for (i = 0; i < N; i+=3) { 2105 // R = Pic[i]; // Member of index 0 2106 // G = Pic[i+1]; // Member of index 1 2107 // B = Pic[i+2]; // Member of index 2 2108 // ... // do something to R, G, B 2109 // } 2110 // To: 2111 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2112 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2113 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2114 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2115 // 2116 // Or translate following interleaved store group (factor = 3): 2117 // for (i = 0; i < N; i+=3) { 2118 // ... do something to R, G, B 2119 // Pic[i] = R; // Member of index 0 2120 // Pic[i+1] = G; // Member of index 1 2121 // Pic[i+2] = B; // Member of index 2 2122 // } 2123 // To: 2124 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2125 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2126 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2127 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2128 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2129 void VPInterleaveRecipe::execute(VPTransformState &State) { 2130 assert(!State.Instance && "Interleave group being replicated."); 2131 const InterleaveGroup<Instruction> *Group = IG; 2132 Instruction *Instr = Group->getInsertPos(); 2133 2134 // Prepare for the vector type of the interleaved load/store. 2135 Type *ScalarTy = getLoadStoreType(Instr); 2136 unsigned InterleaveFactor = Group->getFactor(); 2137 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 2138 2139 // Prepare for the new pointers. 2140 SmallVector<Value *, 2> AddrParts; 2141 unsigned Index = Group->getIndex(Instr); 2142 2143 // TODO: extend the masked interleaved-group support to reversed access. 2144 VPValue *BlockInMask = getMask(); 2145 assert((!BlockInMask || !Group->isReverse()) && 2146 "Reversed masked interleave-group not supported."); 2147 2148 Value *Idx; 2149 // If the group is reverse, adjust the index to refer to the last vector lane 2150 // instead of the first. 
We adjust the index from the first vector lane, 2151 // rather than directly getting the pointer for lane VF - 1, because the 2152 // pointer operand of the interleaved access is supposed to be uniform. For 2153 // uniform instructions, we're only required to generate a value for the 2154 // first vector lane in each unroll iteration. 2155 if (Group->isReverse()) { 2156 Value *RuntimeVF = 2157 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); 2158 Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); 2159 Idx = State.Builder.CreateMul(Idx, 2160 State.Builder.getInt32(Group->getFactor())); 2161 Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index)); 2162 Idx = State.Builder.CreateNeg(Idx); 2163 } else 2164 Idx = State.Builder.getInt32(-Index); 2165 2166 VPValue *Addr = getAddr(); 2167 for (unsigned Part = 0; Part < State.UF; Part++) { 2168 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2169 if (auto *I = dyn_cast<Instruction>(AddrPart)) 2170 State.setDebugLocFrom(I->getDebugLoc()); 2171 2172 // Notice current instruction could be any index. Need to adjust the address 2173 // to the member of index 0. 2174 // 2175 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2176 // b = A[i]; // Member of index 0 2177 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2178 // 2179 // E.g. A[i+1] = a; // Member of index 1 2180 // A[i] = b; // Member of index 0 2181 // A[i+2] = c; // Member of index 2 (Current instruction) 2182 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2183 2184 bool InBounds = false; 2185 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2186 InBounds = gep->isInBounds(); 2187 AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2188 AddrParts.push_back(AddrPart); 2189 } 2190 2191 State.setDebugLocFrom(Instr->getDebugLoc()); 2192 Value *PoisonVec = PoisonValue::get(VecTy); 2193 2194 auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor]( 2195 unsigned Part, Value *MaskForGaps) -> Value * { 2196 if (State.VF.isScalable()) { 2197 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2198 assert(InterleaveFactor == 2 && 2199 "Unsupported deinterleave factor for scalable vectors"); 2200 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2201 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2202 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), 2203 State.VF.getKnownMinValue() * 2, true); 2204 return State.Builder.CreateIntrinsic( 2205 MaskTy, Intrinsic::vector_interleave2, Ops, 2206 /*FMFSource=*/nullptr, "interleaved.mask"); 2207 } 2208 2209 if (!BlockInMask) 2210 return MaskForGaps; 2211 2212 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2213 Value *ShuffledMask = State.Builder.CreateShuffleVector( 2214 BlockInMaskPart, 2215 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()), 2216 "interleaved.mask"); 2217 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 2218 ShuffledMask, MaskForGaps) 2219 : ShuffledMask; 2220 }; 2221 2222 const DataLayout &DL = Instr->getDataLayout(); 2223 // Vectorize the interleaved load group. 2224 if (isa<LoadInst>(Instr)) { 2225 Value *MaskForGaps = nullptr; 2226 if (NeedsMaskForGaps) { 2227 MaskForGaps = createBitMaskForGaps(State.Builder, 2228 State.VF.getKnownMinValue(), *Group); 2229 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2230 } 2231 2232 // For each unroll part, create a wide load for the group. 
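// Illustrative sketch (hypothetical values): with VF = 4 and factor = 3, the
// per-part block mask <m0, m1, m2, m3> is replicated by CreateGroupMask to
// <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3> and, if needed, combined
// with the gap mask before masking the wide <12 x T> load created below.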
2233 SmallVector<Value *, 2> NewLoads; 2234 for (unsigned Part = 0; Part < State.UF; Part++) { 2235 Instruction *NewLoad; 2236 if (BlockInMask || MaskForGaps) { 2237 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2238 NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part], 2239 Group->getAlign(), GroupMask, 2240 PoisonVec, "wide.masked.vec"); 2241 } else 2242 NewLoad = State.Builder.CreateAlignedLoad( 2243 VecTy, AddrParts[Part], Group->getAlign(), "wide.vec"); 2244 Group->addMetadata(NewLoad); 2245 NewLoads.push_back(NewLoad); 2246 } 2247 2248 ArrayRef<VPValue *> VPDefs = definedValues(); 2249 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2250 if (VecTy->isScalableTy()) { 2251 assert(InterleaveFactor == 2 && 2252 "Unsupported deinterleave factor for scalable vectors"); 2253 2254 for (unsigned Part = 0; Part < State.UF; ++Part) { 2255 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2256 // so must use intrinsics to deinterleave. 2257 Value *DI = State.Builder.CreateIntrinsic( 2258 Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part], 2259 /*FMFSource=*/nullptr, "strided.vec"); 2260 unsigned J = 0; 2261 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2262 Instruction *Member = Group->getMember(I); 2263 2264 if (!Member) 2265 continue; 2266 2267 Value *StridedVec = State.Builder.CreateExtractValue(DI, I); 2268 // If this member has different type, cast the result type. 2269 if (Member->getType() != ScalarTy) { 2270 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2271 StridedVec = 2272 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2273 } 2274 2275 if (Group->isReverse()) 2276 StridedVec = 2277 State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2278 2279 State.set(VPDefs[J], StridedVec, Part); 2280 ++J; 2281 } 2282 } 2283 2284 return; 2285 } 2286 2287 // For each member in the group, shuffle out the appropriate data from the 2288 // wide loads. 2289 unsigned J = 0; 2290 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2291 Instruction *Member = Group->getMember(I); 2292 2293 // Skip the gaps in the group. 2294 if (!Member) 2295 continue; 2296 2297 auto StrideMask = 2298 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue()); 2299 for (unsigned Part = 0; Part < State.UF; Part++) { 2300 Value *StridedVec = State.Builder.CreateShuffleVector( 2301 NewLoads[Part], StrideMask, "strided.vec"); 2302 2303 // If this member has different type, cast the result type. 2304 if (Member->getType() != ScalarTy) { 2305 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 2306 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 2307 StridedVec = 2308 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 2309 } 2310 2311 if (Group->isReverse()) 2312 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 2313 2314 State.set(VPDefs[J], StridedVec, Part); 2315 } 2316 ++J; 2317 } 2318 return; 2319 } 2320 2321 // The sub vector type for current instruction. 2322 auto *SubVT = VectorType::get(ScalarTy, State.VF); 2323 2324 // Vectorize the interleaved store group. 
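// Illustrative note (assumed helper behaviour): when the group has gaps,
// createBitMaskForGaps below yields one i1 per lane of the wide vector, e.g.
// for VF = 2 and a factor-3 group missing the member at index 1 it would be
// <1, 0, 1, 1, 0, 1>, disabling the lanes that would otherwise store poison;
// with no gaps it returns null and no extra masking is applied.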
2325 Value *MaskForGaps = 2326 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); 2327 assert((!MaskForGaps || !State.VF.isScalable()) && 2328 "masking gaps for scalable vectors is not yet supported."); 2329 ArrayRef<VPValue *> StoredValues = getStoredValues(); 2330 for (unsigned Part = 0; Part < State.UF; Part++) { 2331 // Collect the stored vector from each member. 2332 SmallVector<Value *, 4> StoredVecs; 2333 unsigned StoredIdx = 0; 2334 for (unsigned i = 0; i < InterleaveFactor; i++) { 2335 assert((Group->getMember(i) || MaskForGaps) && 2336 "Fail to get a member from an interleaved store group"); 2337 Instruction *Member = Group->getMember(i); 2338 2339 // Skip the gaps in the group. 2340 if (!Member) { 2341 Value *Undef = PoisonValue::get(SubVT); 2342 StoredVecs.push_back(Undef); 2343 continue; 2344 } 2345 2346 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2347 ++StoredIdx; 2348 2349 if (Group->isReverse()) 2350 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse"); 2351 2352 // If this member has different type, cast it to a unified type. 2353 2354 if (StoredVec->getType() != SubVT) 2355 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); 2356 2357 StoredVecs.push_back(StoredVec); 2358 } 2359 2360 // Interleave all the smaller vectors into one wider vector. 2361 Value *IVec = 2362 interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); 2363 Instruction *NewStoreInstr; 2364 if (BlockInMask || MaskForGaps) { 2365 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2366 NewStoreInstr = State.Builder.CreateMaskedStore( 2367 IVec, AddrParts[Part], Group->getAlign(), GroupMask); 2368 } else 2369 NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part], 2370 Group->getAlign()); 2371 2372 Group->addMetadata(NewStoreInstr); 2373 } 2374 } 2375 2376 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2377 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 2378 VPSlotTracker &SlotTracker) const { 2379 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 2380 IG->getInsertPos()->printAsOperand(O, false); 2381 O << ", "; 2382 getAddr()->printAsOperand(O, SlotTracker); 2383 VPValue *Mask = getMask(); 2384 if (Mask) { 2385 O << ", "; 2386 Mask->printAsOperand(O, SlotTracker); 2387 } 2388 2389 unsigned OpIdx = 0; 2390 for (unsigned i = 0; i < IG->getFactor(); ++i) { 2391 if (!IG->getMember(i)) 2392 continue; 2393 if (getNumStoreOperands() > 0) { 2394 O << "\n" << Indent << " store "; 2395 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 2396 O << " to index " << i; 2397 } else { 2398 O << "\n" << Indent << " "; 2399 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 2400 O << " = load from index " << i; 2401 } 2402 ++OpIdx; 2403 } 2404 } 2405 #endif 2406 2407 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { 2408 Value *Start = getStartValue()->getLiveInIRValue(); 2409 PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index"); 2410 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 2411 2412 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2413 EntryPart->addIncoming(Start, VectorPH); 2414 EntryPart->setDebugLoc(getDebugLoc()); 2415 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 2416 State.set(this, EntryPart, Part, /*IsScalar*/ true); 2417 } 2418 2419 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2420 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2421 VPSlotTracker 
&SlotTracker) const { 2422 O << Indent << "EMIT "; 2423 printAsOperand(O, SlotTracker); 2424 O << " = CANONICAL-INDUCTION "; 2425 printOperands(O, SlotTracker); 2426 } 2427 #endif 2428 2429 bool VPCanonicalIVPHIRecipe::isCanonical( 2430 InductionDescriptor::InductionKind Kind, VPValue *Start, 2431 VPValue *Step) const { 2432 // Must be an integer induction. 2433 if (Kind != InductionDescriptor::IK_IntInduction) 2434 return false; 2435 // Start must match the start value of this canonical induction. 2436 if (Start != getStartValue()) 2437 return false; 2438 2439 // If the step is defined by a recipe, it is not a ConstantInt. 2440 if (Step->getDefiningRecipe()) 2441 return false; 2442 2443 ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); 2444 return StepC && StepC->isOne(); 2445 } 2446 2447 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 2448 return IsScalarAfterVectorization && 2449 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 2450 } 2451 2452 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2453 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 2454 VPSlotTracker &SlotTracker) const { 2455 O << Indent << "EMIT "; 2456 printAsOperand(O, SlotTracker); 2457 O << " = WIDEN-POINTER-INDUCTION "; 2458 getStartValue()->printAsOperand(O, SlotTracker); 2459 O << ", " << *IndDesc.getStep(); 2460 } 2461 #endif 2462 2463 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 2464 assert(!State.Instance && "cannot be used in per-lane"); 2465 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 2466 SCEVExpander Exp(SE, DL, "induction"); 2467 2468 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 2469 &*State.Builder.GetInsertPoint()); 2470 assert(!State.ExpandedSCEVs.contains(Expr) && 2471 "Same SCEV expanded multiple times"); 2472 State.ExpandedSCEVs[Expr] = Res; 2473 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) 2474 State.set(this, Res, {Part, 0}); 2475 } 2476 2477 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2478 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 2479 VPSlotTracker &SlotTracker) const { 2480 O << Indent << "EMIT "; 2481 getVPSingleValue()->printAsOperand(O, SlotTracker); 2482 O << " = EXPAND SCEV " << *Expr; 2483 } 2484 #endif 2485 2486 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 2487 Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true); 2488 Type *STy = CanonicalIV->getType(); 2489 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 2490 ElementCount VF = State.VF; 2491 Value *VStart = VF.isScalar() 2492 ? 
CanonicalIV 2493 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 2494 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { 2495 Value *VStep = createStepForVF(Builder, STy, VF, Part); 2496 if (VF.isVector()) { 2497 VStep = Builder.CreateVectorSplat(VF, VStep); 2498 VStep = 2499 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 2500 } 2501 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 2502 State.set(this, CanonicalVectorIV, Part); 2503 } 2504 } 2505 2506 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2507 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 2508 VPSlotTracker &SlotTracker) const { 2509 O << Indent << "EMIT "; 2510 printAsOperand(O, SlotTracker); 2511 O << " = WIDEN-CANONICAL-INDUCTION "; 2512 printOperands(O, SlotTracker); 2513 } 2514 #endif 2515 2516 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 2517 auto &Builder = State.Builder; 2518 // Create a vector from the initial value. 2519 auto *VectorInit = getStartValue()->getLiveInIRValue(); 2520 2521 Type *VecTy = State.VF.isScalar() 2522 ? VectorInit->getType() 2523 : VectorType::get(VectorInit->getType(), State.VF); 2524 2525 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2526 if (State.VF.isVector()) { 2527 auto *IdxTy = Builder.getInt32Ty(); 2528 auto *One = ConstantInt::get(IdxTy, 1); 2529 IRBuilder<>::InsertPointGuard Guard(Builder); 2530 Builder.SetInsertPoint(VectorPH->getTerminator()); 2531 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); 2532 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 2533 VectorInit = Builder.CreateInsertElement( 2534 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); 2535 } 2536 2537 // Create a phi node for the new recurrence. 2538 PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur"); 2539 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 2540 EntryPart->addIncoming(VectorInit, VectorPH); 2541 State.set(this, EntryPart, 0); 2542 } 2543 2544 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2545 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, 2546 VPSlotTracker &SlotTracker) const { 2547 O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; 2548 printAsOperand(O, SlotTracker); 2549 O << " = phi "; 2550 printOperands(O, SlotTracker); 2551 } 2552 #endif 2553 2554 void VPReductionPHIRecipe::execute(VPTransformState &State) { 2555 auto &Builder = State.Builder; 2556 2557 // Reductions do not have to start at zero. They can start with 2558 // any loop invariant values. 2559 VPValue *StartVPV = getStartValue(); 2560 Value *StartV = StartVPV->getLiveInIRValue(); 2561 2562 // In order to support recurrences we need to be able to vectorize Phi nodes. 2563 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 2564 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 2565 // this value when we vectorize all of the instructions that use the PHI. 2566 bool ScalarPHI = State.VF.isScalar() || IsInLoop; 2567 Type *VecTy = ScalarPHI ? StartV->getType() 2568 : VectorType::get(StartV->getType(), State.VF); 2569 2570 BasicBlock *HeaderBB = State.CFG.PrevBB; 2571 assert(State.CurrentVectorLoop->getHeader() == HeaderBB && 2572 "recipe must be in the vector loop header"); 2573 unsigned LastPartForNewPhi = isOrdered() ? 
1 : State.UF; 2574 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { 2575 Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); 2576 EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); 2577 State.set(this, EntryPart, Part, IsInLoop); 2578 } 2579 2580 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2581 2582 Value *Iden = nullptr; 2583 RecurKind RK = RdxDesc.getRecurrenceKind(); 2584 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || 2585 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 2586 // MinMax and AnyOf reductions have the start value as their identity. 2587 if (ScalarPHI) { 2588 Iden = StartV; 2589 } else { 2590 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 2591 Builder.SetInsertPoint(VectorPH->getTerminator()); 2592 StartV = Iden = 2593 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); 2594 } 2595 } else { 2596 Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), 2597 RdxDesc.getFastMathFlags()); 2598 2599 if (!ScalarPHI) { 2600 Iden = Builder.CreateVectorSplat(State.VF, Iden); 2601 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 2602 Builder.SetInsertPoint(VectorPH->getTerminator()); 2603 Constant *Zero = Builder.getInt32(0); 2604 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 2605 } 2606 } 2607 2608 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { 2609 Value *EntryPart = State.get(this, Part, IsInLoop); 2610 // Make sure to add the reduction start value only to the 2611 // first unroll part. 2612 Value *StartVal = (Part == 0) ? StartV : Iden; 2613 cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH); 2614 } 2615 } 2616 2617 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2618 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2619 VPSlotTracker &SlotTracker) const { 2620 O << Indent << "WIDEN-REDUCTION-PHI "; 2621 2622 printAsOperand(O, SlotTracker); 2623 O << " = phi "; 2624 printOperands(O, SlotTracker); 2625 } 2626 #endif 2627 2628 void VPWidenPHIRecipe::execute(VPTransformState &State) { 2629 assert(EnableVPlanNativePath && 2630 "Non-native vplans are not expected to have VPWidenPHIRecipes."); 2631 2632 Value *Op0 = State.get(getOperand(0), 0); 2633 Type *VecTy = Op0->getType(); 2634 Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); 2635 State.set(this, VecPhi, 0); 2636 } 2637 2638 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2639 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2640 VPSlotTracker &SlotTracker) const { 2641 O << Indent << "WIDEN-PHI "; 2642 2643 auto *OriginalPhi = cast<PHINode>(getUnderlyingValue()); 2644 // Unless all incoming values are modeled in VPlan print the original PHI 2645 // directly. 2646 // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming 2647 // values as VPValues. 2648 if (getNumOperands() != OriginalPhi->getNumOperands()) { 2649 O << VPlanIngredient(OriginalPhi); 2650 return; 2651 } 2652 2653 printAsOperand(O, SlotTracker); 2654 O << " = phi "; 2655 printOperands(O, SlotTracker); 2656 } 2657 #endif 2658 2659 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and 2660 // remove VPActiveLaneMaskPHIRecipe. 
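// For illustration (assumed usage): the phi emitted below typically carries a
// <VF x i1> mask such as one produced by llvm.get.active.lane.mask in the
// preheader; its backedge value, the mask for the next iteration, is wired up
// after all recipes have been executed.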
2661 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { 2662 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2663 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { 2664 Value *StartMask = State.get(getOperand(0), Part); 2665 PHINode *EntryPart = 2666 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); 2667 EntryPart->addIncoming(StartMask, VectorPH); 2668 EntryPart->setDebugLoc(getDebugLoc()); 2669 State.set(this, EntryPart, Part); 2670 } 2671 } 2672 2673 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2674 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2675 VPSlotTracker &SlotTracker) const { 2676 O << Indent << "ACTIVE-LANE-MASK-PHI "; 2677 2678 printAsOperand(O, SlotTracker); 2679 O << " = phi "; 2680 printOperands(O, SlotTracker); 2681 } 2682 #endif 2683 2684 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) { 2685 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 2686 assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization."); 2687 Value *Start = State.get(getOperand(0), VPIteration(0, 0)); 2688 PHINode *EntryPart = 2689 State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv"); 2690 EntryPart->addIncoming(Start, VectorPH); 2691 EntryPart->setDebugLoc(getDebugLoc()); 2692 State.set(this, EntryPart, 0, /*IsScalar=*/true); 2693 } 2694 2695 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2696 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 2697 VPSlotTracker &SlotTracker) const { 2698 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; 2699 2700 printAsOperand(O, SlotTracker); 2701 O << " = phi "; 2702 printOperands(O, SlotTracker); 2703 } 2704 #endif 2705
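// A minimal sketch (hypothetical IR, names assumed) of the phi created by
// VPEVLBasedIVPHIRecipe::execute above, for an i64 start value of 0:
//   %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %iv.next.evl, %vector.body ]
// The backedge value is the IV advanced by the explicit vector length
// computed for that iteration; EVL vectorization runs with UF == 1, as
// asserted above.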