1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Twine.h"
19 #include "llvm/Analysis/IVDescriptors.h"
20 #include "llvm/IR/BasicBlock.h"
21 #include "llvm/IR/IRBuilder.h"
22 #include "llvm/IR/Instruction.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/Type.h"
25 #include "llvm/IR/Value.h"
26 #include "llvm/Support/Casting.h"
27 #include "llvm/Support/CommandLine.h"
28 #include "llvm/Support/Debug.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
31 #include "llvm/Transforms/Utils/LoopUtils.h"
32 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
33 #include <cassert>
34
35 using namespace llvm;
36
37 using VectorParts = SmallVector<Value *, 2>;
38
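// Command-line options defined in LoopVectorize.cpp and reused by the recipe
// implementations below.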
39 namespace llvm {
40 extern cl::opt<bool> EnableVPlanNativePath;
41 }
42 extern cl::opt<unsigned> ForceTargetInstructionCost;
43
44 #define LV_NAME "loop-vectorize"
45 #define DEBUG_TYPE LV_NAME
46
bool VPRecipeBase::mayWriteToMemory() const {
48 switch (getVPDefID()) {
49 case VPInterleaveSC:
50 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
51 case VPWidenStoreEVLSC:
52 case VPWidenStoreSC:
53 return true;
54 case VPReplicateSC:
55 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
56 ->mayWriteToMemory();
57 case VPWidenCallSC:
58 return !cast<VPWidenCallRecipe>(this)
59 ->getCalledScalarFunction()
60 ->onlyReadsMemory();
61 case VPBranchOnMaskSC:
62 case VPScalarIVStepsSC:
63 case VPPredInstPHISC:
64 return false;
65 case VPBlendSC:
66 case VPReductionEVLSC:
67 case VPReductionSC:
68 case VPWidenCanonicalIVSC:
69 case VPWidenCastSC:
70 case VPWidenGEPSC:
71 case VPWidenIntOrFpInductionSC:
72 case VPWidenLoadEVLSC:
73 case VPWidenLoadSC:
74 case VPWidenPHISC:
75 case VPWidenSC:
76 case VPWidenSelectSC: {
77 const Instruction *I =
78 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
79 (void)I;
80 assert((!I || !I->mayWriteToMemory()) &&
81 "underlying instruction may write to memory");
82 return false;
83 }
84 default:
85 return true;
86 }
87 }
88
bool VPRecipeBase::mayReadFromMemory() const {
90 switch (getVPDefID()) {
91 case VPWidenLoadEVLSC:
92 case VPWidenLoadSC:
93 return true;
94 case VPReplicateSC:
95 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
96 ->mayReadFromMemory();
97 case VPWidenCallSC:
98 return !cast<VPWidenCallRecipe>(this)
99 ->getCalledScalarFunction()
100 ->onlyWritesMemory();
101 case VPBranchOnMaskSC:
102 case VPPredInstPHISC:
103 case VPScalarIVStepsSC:
104 case VPWidenStoreEVLSC:
105 case VPWidenStoreSC:
106 return false;
107 case VPBlendSC:
108 case VPReductionEVLSC:
109 case VPReductionSC:
110 case VPWidenCanonicalIVSC:
111 case VPWidenCastSC:
112 case VPWidenGEPSC:
113 case VPWidenIntOrFpInductionSC:
114 case VPWidenPHISC:
115 case VPWidenSC:
116 case VPWidenSelectSC: {
117 const Instruction *I =
118 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
119 (void)I;
120 assert((!I || !I->mayReadFromMemory()) &&
121 "underlying instruction may read from memory");
122 return false;
123 }
124 default:
125 return true;
126 }
127 }
128
bool VPRecipeBase::mayHaveSideEffects() const {
130 switch (getVPDefID()) {
131 case VPDerivedIVSC:
132 case VPPredInstPHISC:
133 case VPScalarCastSC:
134 return false;
135 case VPInstructionSC:
136 switch (cast<VPInstruction>(this)->getOpcode()) {
137 case Instruction::Or:
138 case Instruction::ICmp:
139 case Instruction::Select:
140 case VPInstruction::Not:
141 case VPInstruction::CalculateTripCountMinusVF:
142 case VPInstruction::CanonicalIVIncrementForPart:
143 case VPInstruction::ExtractFromEnd:
144 case VPInstruction::FirstOrderRecurrenceSplice:
145 case VPInstruction::LogicalAnd:
146 case VPInstruction::PtrAdd:
147 return false;
148 default:
149 return true;
150 }
151 case VPWidenCallSC: {
152 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
153 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
154 }
155 case VPBlendSC:
156 case VPReductionEVLSC:
157 case VPReductionSC:
158 case VPScalarIVStepsSC:
159 case VPWidenCanonicalIVSC:
160 case VPWidenCastSC:
161 case VPWidenGEPSC:
162 case VPWidenIntOrFpInductionSC:
163 case VPWidenPHISC:
164 case VPWidenPointerInductionSC:
165 case VPWidenSC:
166 case VPWidenSelectSC: {
167 const Instruction *I =
168 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
169 (void)I;
170 assert((!I || !I->mayHaveSideEffects()) &&
171 "underlying instruction has side-effects");
172 return false;
173 }
174 case VPInterleaveSC:
175 return mayWriteToMemory();
176 case VPWidenLoadEVLSC:
177 case VPWidenLoadSC:
178 case VPWidenStoreEVLSC:
179 case VPWidenStoreSC:
180 assert(
181 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
182 mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
184 "implementation");
185 return mayWriteToMemory();
186 case VPReplicateSC: {
187 auto *R = cast<VPReplicateRecipe>(this);
188 return R->getUnderlyingInstr()->mayHaveSideEffects();
189 }
190 default:
191 return true;
192 }
193 }
194
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
196 VPValue *ExitValue = getOperand(0);
197 auto Lane = vputils::isUniformAfterVectorization(ExitValue)
198 ? VPLane::getFirstLane()
199 : VPLane::getLastLaneForVF(State.VF);
200 VPBasicBlock *MiddleVPBB =
201 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
202 VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
203 auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live-out phis in the exiting block
  // via the middle block.
206 auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
207 ? MiddleVPBB
208 : ExitingVPBB;
209 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
210 // Set insertion point in PredBB in case an extract needs to be generated.
211 // TODO: Model extracts explicitly.
212 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
213 Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
214 if (Phi->getBasicBlockIndex(PredBB) != -1)
215 Phi->setIncomingValueForBlock(PredBB, V);
216 else
217 Phi->addIncoming(V, PredBB);
218 }
219
220 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
222 O << "Live-out ";
223 getPhi()->printAsOperand(O);
224 O << " = ";
225 getOperand(0)->printAsOperand(O, SlotTracker);
226 O << "\n";
227 }
228 #endif
229
void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
231 assert(!Parent && "Recipe already in some VPBasicBlock");
232 assert(InsertPos->getParent() &&
233 "Insertion position not in any VPBasicBlock");
234 InsertPos->getParent()->insert(this, InsertPos->getIterator());
235 }
236
void VPRecipeBase::insertBefore(VPBasicBlock &BB,
238 iplist<VPRecipeBase>::iterator I) {
239 assert(!Parent && "Recipe already in some VPBasicBlock");
240 assert(I == BB.end() || I->getParent() == &BB);
241 BB.insert(this, I);
242 }
243
void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
245 assert(!Parent && "Recipe already in some VPBasicBlock");
246 assert(InsertPos->getParent() &&
247 "Insertion position not in any VPBasicBlock");
248 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
249 }
250
void VPRecipeBase::removeFromParent() {
252 assert(getParent() && "Recipe not in any VPBasicBlock");
253 getParent()->getRecipeList().remove(getIterator());
254 Parent = nullptr;
255 }
256
iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
258 assert(getParent() && "Recipe not in any VPBasicBlock");
259 return getParent()->getRecipeList().erase(getIterator());
260 }
261
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
263 removeFromParent();
264 insertAfter(InsertPos);
265 }
266
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
268 iplist<VPRecipeBase>::iterator I) {
269 removeFromParent();
270 insertBefore(BB, I);
271 }
272
273 /// Return the underlying instruction to be used for computing \p R's cost via
274 /// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
276 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
277 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
278 if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
279 return IG->getInsertPos();
280 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
281 return &WidenMem->getIngredient();
282 return nullptr;
283 }
284
InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
286 if (auto *UI = getInstructionForCost(this))
287 if (Ctx.skipCostComputation(UI, VF.isVector()))
288 return 0;
289
290 InstructionCost RecipeCost = computeCost(VF, Ctx);
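  // If ForceTargetInstructionCost was specified, override any valid computed
  // cost with the forced value.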
291 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
292 RecipeCost.isValid())
293 RecipeCost = InstructionCost(ForceTargetInstructionCost);
294
295 LLVM_DEBUG({
296 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
297 dump();
298 });
299 return RecipeCost;
300 }
301
InstructionCost VPRecipeBase::computeCost(ElementCount VF,
303 VPCostContext &Ctx) const {
304 // Compute the cost for the recipe falling back to the legacy cost model using
305 // the underlying instruction. If there is no underlying instruction, returns
306 // 0.
307 Instruction *UI = getInstructionForCost(this);
308 if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipes may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing their cost multiple times for now.
311 Ctx.SkipCostComputation.insert(UI);
312 }
313 return UI ? Ctx.getLegacyCost(UI, VF) : 0;
314 }
315
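// Reassemble an LLVM FastMathFlags value from the compact flag fields stored
// on the recipe.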
FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
317 assert(OpType == OperationType::FPMathOp &&
318 "recipe doesn't have fast math flags");
319 FastMathFlags Res;
320 Res.setAllowReassoc(FMFs.AllowReassoc);
321 Res.setNoNaNs(FMFs.NoNaNs);
322 Res.setNoInfs(FMFs.NoInfs);
323 Res.setNoSignedZeros(FMFs.NoSignedZeros);
324 Res.setAllowReciprocal(FMFs.AllowReciprocal);
325 Res.setAllowContract(FMFs.AllowContract);
326 Res.setApproxFunc(FMFs.ApproxFunc);
327 return Res;
328 }
329
VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
331 VPValue *A, VPValue *B, DebugLoc DL,
332 const Twine &Name)
333 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
334 Pred, DL),
335 Opcode(Opcode), Name(Name.str()) {
336 assert(Opcode == Instruction::ICmp &&
337 "only ICmp predicates supported at the moment");
338 }
339
VPInstruction::VPInstruction(unsigned Opcode,
341 std::initializer_list<VPValue *> Operands,
342 FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
343 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
344 Opcode(Opcode), Name(Name.str()) {
345 // Make sure the VPInstruction is a floating-point operation.
346 assert(isFPMathOp() && "this op can't take fast-math flags");
347 }
348
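// Only PtrAdds whose result is used beyond the first lane generate a separate
// scalar value per lane.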
bool VPInstruction::doesGeneratePerAllLanes() const {
350 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
351 }
352
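// Returns true if this VPInstruction can be narrowed to compute only a scalar
// value for the first lane.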
bool VPInstruction::canGenerateScalarForFirstLane() const {
354 if (Instruction::isBinaryOp(getOpcode()))
355 return true;
356 if (isSingleScalar() || isVectorToScalar())
357 return true;
358 switch (Opcode) {
359 case Instruction::ICmp:
360 case VPInstruction::BranchOnCond:
361 case VPInstruction::BranchOnCount:
362 case VPInstruction::CalculateTripCountMinusVF:
363 case VPInstruction::CanonicalIVIncrementForPart:
364 case VPInstruction::PtrAdd:
365 case VPInstruction::ExplicitVectorLength:
366 return true;
367 default:
368 return false;
369 }
370 }
371
Value *VPInstruction::generatePerLane(VPTransformState &State,
373 const VPIteration &Lane) {
374 IRBuilderBase &Builder = State.Builder;
375
376 assert(getOpcode() == VPInstruction::PtrAdd &&
377 "only PtrAdd opcodes are supported for now");
378 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
379 State.get(getOperand(1), Lane), Name);
380 }
381
Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
383 IRBuilderBase &Builder = State.Builder;
384
385 if (Instruction::isBinaryOp(getOpcode())) {
386 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
387 Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
388 Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
389 auto *Res =
390 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
391 if (auto *I = dyn_cast<Instruction>(Res))
392 setFlags(I);
393 return Res;
394 }
395
396 switch (getOpcode()) {
397 case VPInstruction::Not: {
398 Value *A = State.get(getOperand(0), Part);
399 return Builder.CreateNot(A, Name);
400 }
401 case Instruction::ICmp: {
402 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
403 Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
404 Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
405 return Builder.CreateCmp(getPredicate(), A, B, Name);
406 }
407 case Instruction::Select: {
408 Value *Cond = State.get(getOperand(0), Part);
409 Value *Op1 = State.get(getOperand(1), Part);
410 Value *Op2 = State.get(getOperand(2), Part);
411 return Builder.CreateSelect(Cond, Op1, Op2, Name);
412 }
413 case VPInstruction::ActiveLaneMask: {
414 // Get first lane of vector induction variable.
415 Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
416 // Get the original loop tripcount.
417 Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
418
419 // If this part of the active lane mask is scalar, generate the CMP directly
420 // to avoid unnecessary extracts.
421 if (State.VF.isScalar())
422 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
423 Name);
424
425 auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
426 auto *PredTy = VectorType::get(Int1Ty, State.VF);
427 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
428 {PredTy, ScalarTC->getType()},
429 {VIVElem0, ScalarTC}, nullptr, Name);
430 }
431 case VPInstruction::FirstOrderRecurrenceSplice: {
432 // Generate code to combine the previous and current values in vector v3.
433 //
434 // vector.ph:
435 // v_init = vector(..., ..., ..., a[-1])
436 // br vector.body
437 //
438 // vector.body
439 // i = phi [0, vector.ph], [i+4, vector.body]
440 // v1 = phi [v_init, vector.ph], [v2, vector.body]
441 // v2 = a[i, i+1, i+2, i+3];
442 // v3 = vector(v1(3), v2(0, 1, 2))
443
444 // For the first part, use the recurrence phi (v1), otherwise v2.
445 auto *V1 = State.get(getOperand(0), 0);
446 Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
447 if (!PartMinus1->getType()->isVectorTy())
448 return PartMinus1;
449 Value *V2 = State.get(getOperand(1), Part);
450 return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
451 }
452 case VPInstruction::CalculateTripCountMinusVF: {
453 if (Part != 0)
454 return State.get(this, 0, /*IsScalar*/ true);
455
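    // Compute TC > VF * UF ? TC - VF * UF : 0, i.e. the trip count minus one
    // full vector step, clamped at zero to avoid underflow.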
456 Value *ScalarTC = State.get(getOperand(0), {0, 0});
457 Value *Step =
458 createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
459 Value *Sub = Builder.CreateSub(ScalarTC, Step);
460 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
461 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
462 return Builder.CreateSelect(Cmp, Sub, Zero);
463 }
464 case VPInstruction::ExplicitVectorLength: {
465 // Compute EVL
466 auto GetEVL = [=](VPTransformState &State, Value *AVL) {
467 assert(AVL->getType()->isIntegerTy() &&
468 "Requested vector length should be an integer.");
469
470 // TODO: Add support for MaxSafeDist for correct loop emission.
471 assert(State.VF.isScalable() && "Expected scalable vector factor.");
472 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
473
474 Value *EVL = State.Builder.CreateIntrinsic(
475 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
476 {AVL, VFArg, State.Builder.getTrue()});
477 return EVL;
478 };
479 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
480 // be outside of the main loop.
481 assert(Part == 0 && "No unrolling expected for predicated vectorization.");
482 // Compute VTC - IV as the AVL (requested vector length).
483 Value *Index = State.get(getOperand(0), VPIteration(0, 0));
484 Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
485 Value *AVL = State.Builder.CreateSub(TripCount, Index);
486 Value *EVL = GetEVL(State, AVL);
487 return EVL;
488 }
489 case VPInstruction::CanonicalIVIncrementForPart: {
490 auto *IV = State.get(getOperand(0), VPIteration(0, 0));
491 if (Part == 0)
492 return IV;
493
494 // The canonical IV is incremented by the vectorization factor (num of SIMD
495 // elements) times the unroll part.
496 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
497 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
498 hasNoSignedWrap());
499 }
500 case VPInstruction::BranchOnCond: {
501 if (Part != 0)
502 return nullptr;
503
504 Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
505 // Replace the temporary unreachable terminator with a new conditional
506 // branch, hooking it up to backward destination for exiting blocks now and
507 // to forward destination(s) later when they are created.
508 BranchInst *CondBr =
509 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
510 CondBr->setSuccessor(0, nullptr);
511 Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
512
513 if (!getParent()->isExiting())
514 return CondBr;
515
516 VPRegionBlock *ParentRegion = getParent()->getParent();
517 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
518 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
519 return CondBr;
520 }
521 case VPInstruction::BranchOnCount: {
522 if (Part != 0)
523 return nullptr;
524 // First create the compare.
525 Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
526 Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
527 Value *Cond = Builder.CreateICmpEQ(IV, TC);
528
529 // Now create the branch.
530 auto *Plan = getParent()->getPlan();
531 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
532 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
533
534 // Replace the temporary unreachable terminator with a new conditional
535 // branch, hooking it up to backward destination (the header) now and to the
536 // forward destination (the exit/middle block) later when it is created.
537 // Note that CreateCondBr expects a valid BB as first argument, so we need
538 // to set it to nullptr later.
539 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
540 State.CFG.VPBB2IRBB[Header]);
541 CondBr->setSuccessor(0, nullptr);
542 Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
543 return CondBr;
544 }
545 case VPInstruction::ComputeReductionResult: {
546 if (Part != 0)
547 return State.get(this, 0, /*IsScalar*/ true);
548
549 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
550 // and will be removed by breaking up the recipe further.
551 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
552 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
553 // Get its reduction variable descriptor.
554 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
555
556 RecurKind RK = RdxDesc.getRecurrenceKind();
557
558 VPValue *LoopExitingDef = getOperand(1);
559 Type *PhiTy = OrigPhi->getType();
560 VectorParts RdxParts(State.UF);
561 for (unsigned Part = 0; Part < State.UF; ++Part)
562 RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());
563
564 // If the vector reduction can be performed in a smaller type, we truncate
565 // then extend the loop exit value to enable InstCombine to evaluate the
566 // entire expression in the smaller type.
567 // TODO: Handle this in truncateToMinBW.
568 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
569 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
570 for (unsigned Part = 0; Part < State.UF; ++Part)
571 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
572 }
573 // Reduce all of the unrolled parts into a single vector.
574 Value *ReducedPartRdx = RdxParts[0];
575 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
576 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
577 Op = Instruction::Or;
578
579 if (PhiR->isOrdered()) {
580 ReducedPartRdx = RdxParts[State.UF - 1];
581 } else {
582 // Floating-point operations should have some FMF to enable the reduction.
583 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
584 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
585 for (unsigned Part = 1; Part < State.UF; ++Part) {
586 Value *RdxPart = RdxParts[Part];
587 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
588 ReducedPartRdx = Builder.CreateBinOp(
589 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
590 else
591 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
592 }
593 }
594
595 // Create the reduction after the loop. Note that inloop reductions create
596 // the target reduction in the loop using a Reduction recipe.
597 if ((State.VF.isVector() ||
598 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
599 !PhiR->isInLoop()) {
600 ReducedPartRdx =
601 createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
602 // If the reduction can be performed in a smaller type, we need to extend
603 // the reduction to the wider type before we branch to the original loop.
604 if (PhiTy != RdxDesc.getRecurrenceType())
605 ReducedPartRdx = RdxDesc.isSigned()
606 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
607 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
608 }
609
610 // If there were stores of the reduction value to a uniform memory address
611 // inside the loop, create the final store here.
612 if (StoreInst *SI = RdxDesc.IntermediateStore) {
613 auto *NewSI = Builder.CreateAlignedStore(
614 ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
615 propagateMetadata(NewSI, SI);
616 }
617
618 return ReducedPartRdx;
619 }
620 case VPInstruction::ExtractFromEnd: {
621 if (Part != 0)
622 return State.get(this, 0, /*IsScalar*/ true);
623
624 auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
625 unsigned Offset = CI->getZExtValue();
626 assert(Offset > 0 && "Offset from end must be positive");
627 Value *Res;
628 if (State.VF.isVector()) {
629 assert(Offset <= State.VF.getKnownMinValue() &&
630 "invalid offset to extract from");
631 // Extract lane VF - Offset from the operand.
632 Res = State.get(
633 getOperand(0),
634 VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
635 } else {
636 assert(Offset <= State.UF && "invalid offset to extract from");
      // When the loop is unrolled without vectorizing, retrieve part UF - Offset.
638 Res = State.get(getOperand(0), State.UF - Offset);
639 }
640 if (isa<ExtractElementInst>(Res))
641 Res->setName(Name);
642 return Res;
643 }
644 case VPInstruction::LogicalAnd: {
645 Value *A = State.get(getOperand(0), Part);
646 Value *B = State.get(getOperand(1), Part);
647 return Builder.CreateLogicalAnd(A, B, Name);
648 }
649 case VPInstruction::PtrAdd: {
650 assert(vputils::onlyFirstLaneUsed(this) &&
651 "can only generate first lane for PtrAdd");
652 Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
653 Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
654 return Builder.CreatePtrAdd(Ptr, Addend, Name);
655 }
656 case VPInstruction::ResumePhi: {
657 if (Part != 0)
658 return State.get(this, 0, /*IsScalar*/ true);
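    // Create a phi that takes operand 0 for the edge from the VPlan-emitted
    // predecessor and operand 1 for every other existing predecessor of the
    // current block.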
659 Value *IncomingFromVPlanPred =
660 State.get(getOperand(0), Part, /* IsScalar */ true);
661 Value *IncomingFromOtherPreds =
662 State.get(getOperand(1), Part, /* IsScalar */ true);
663 auto *NewPhi =
664 Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
665 BasicBlock *VPlanPred =
666 State.CFG
667 .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
668 NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
669 for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
670 assert(OtherPred != VPlanPred &&
671 "VPlan predecessors should not be connected yet");
672 NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
673 }
674 return NewPhi;
675 }
676
677 default:
678 llvm_unreachable("Unsupported opcode for instruction");
679 }
680 }
681
bool VPInstruction::isVectorToScalar() const {
683 return getOpcode() == VPInstruction::ExtractFromEnd ||
684 getOpcode() == VPInstruction::ComputeReductionResult;
685 }
686
bool VPInstruction::isSingleScalar() const {
688 return getOpcode() == VPInstruction::ResumePhi;
689 }
690
691 #if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
693 // Inspired by FPMathOperator::classof. Notable differences are that we don't
694 // support Call, PHI and Select opcodes here yet.
695 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
696 Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
697 Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
698 Opcode == Instruction::FCmp || Opcode == Instruction::Select;
699 }
700 #endif
701
void VPInstruction::execute(VPTransformState &State) {
703 assert(!State.Instance && "VPInstruction executing an Instance");
704 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
705 assert((hasFastMathFlags() == isFPMathOp() ||
706 getOpcode() == Instruction::Select) &&
707 "Recipe not a FPMathOp but has fast-math flags?");
708 if (hasFastMathFlags())
709 State.Builder.setFastMathFlags(getFastMathFlags());
710 State.setDebugLocFrom(getDebugLoc());
711 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
712 (vputils::onlyFirstLaneUsed(this) ||
713 isVectorToScalar() || isSingleScalar());
714 bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
715 bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
716 for (unsigned Part = 0; Part < State.UF; ++Part) {
717 if (GeneratesPerAllLanes) {
718 for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
719 Lane != NumLanes; ++Lane) {
720 Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
721 assert(GeneratedValue && "generatePerLane must produce a value");
722 State.set(this, GeneratedValue, VPIteration(Part, Lane));
723 }
724 continue;
725 }
726
727 if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
728 Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
729 State.set(this, Part0, Part,
730 /*IsScalar*/ GeneratesPerFirstLaneOnly);
731 continue;
732 }
733
734 Value *GeneratedValue = generatePerPart(State, Part);
735 if (!hasResult())
736 continue;
737 assert(GeneratedValue && "generatePerPart must produce a value");
738 assert((GeneratedValue->getType()->isVectorTy() ==
739 !GeneratesPerFirstLaneOnly ||
740 State.VF.isScalar()) &&
741 "scalar value but not only first lane defined");
742 State.set(this, GeneratedValue, Part,
743 /*IsScalar*/ GeneratesPerFirstLaneOnly);
744 }
745 }
746
bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
748 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
749 if (Instruction::isBinaryOp(getOpcode()))
750 return vputils::onlyFirstLaneUsed(this);
751
752 switch (getOpcode()) {
753 default:
754 return false;
755 case Instruction::ICmp:
756 case VPInstruction::PtrAdd:
757 // TODO: Cover additional opcodes.
758 return vputils::onlyFirstLaneUsed(this);
759 case VPInstruction::ActiveLaneMask:
760 case VPInstruction::ExplicitVectorLength:
761 case VPInstruction::CalculateTripCountMinusVF:
762 case VPInstruction::CanonicalIVIncrementForPart:
763 case VPInstruction::BranchOnCount:
764 case VPInstruction::BranchOnCond:
765 case VPInstruction::ResumePhi:
766 return true;
767 };
768 llvm_unreachable("switch should return");
769 }
770
bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
772 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
773 if (Instruction::isBinaryOp(getOpcode()))
774 return vputils::onlyFirstPartUsed(this);
775
776 switch (getOpcode()) {
777 default:
778 return false;
779 case Instruction::ICmp:
780 case Instruction::Select:
781 return vputils::onlyFirstPartUsed(this);
782 case VPInstruction::BranchOnCount:
783 case VPInstruction::BranchOnCond:
784 case VPInstruction::CanonicalIVIncrementForPart:
785 return true;
786 };
787 llvm_unreachable("switch should return");
788 }
789
790 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
792 VPSlotTracker SlotTracker(getParent()->getPlan());
793 print(dbgs(), "", SlotTracker);
794 }
795
void VPInstruction::print(raw_ostream &O, const Twine &Indent,
797 VPSlotTracker &SlotTracker) const {
798 O << Indent << "EMIT ";
799
800 if (hasResult()) {
801 printAsOperand(O, SlotTracker);
802 O << " = ";
803 }
804
805 switch (getOpcode()) {
806 case VPInstruction::Not:
807 O << "not";
808 break;
809 case VPInstruction::SLPLoad:
810 O << "combined load";
811 break;
812 case VPInstruction::SLPStore:
813 O << "combined store";
814 break;
815 case VPInstruction::ActiveLaneMask:
816 O << "active lane mask";
817 break;
818 case VPInstruction::ResumePhi:
819 O << "resume-phi";
820 break;
821 case VPInstruction::ExplicitVectorLength:
822 O << "EXPLICIT-VECTOR-LENGTH";
823 break;
824 case VPInstruction::FirstOrderRecurrenceSplice:
825 O << "first-order splice";
826 break;
827 case VPInstruction::BranchOnCond:
828 O << "branch-on-cond";
829 break;
830 case VPInstruction::CalculateTripCountMinusVF:
831 O << "TC > VF ? TC - VF : 0";
832 break;
833 case VPInstruction::CanonicalIVIncrementForPart:
834 O << "VF * Part +";
835 break;
836 case VPInstruction::BranchOnCount:
837 O << "branch-on-count";
838 break;
839 case VPInstruction::ExtractFromEnd:
840 O << "extract-from-end";
841 break;
842 case VPInstruction::ComputeReductionResult:
843 O << "compute-reduction-result";
844 break;
845 case VPInstruction::LogicalAnd:
846 O << "logical-and";
847 break;
848 case VPInstruction::PtrAdd:
849 O << "ptradd";
850 break;
851 default:
852 O << Instruction::getOpcodeName(getOpcode());
853 }
854
855 printFlags(O);
856 printOperands(O, SlotTracker);
857
858 if (auto DL = getDebugLoc()) {
859 O << ", !dbg ";
860 DL.print(O);
861 }
862 }
863 #endif
864
void VPWidenCallRecipe::execute(VPTransformState &State) {
866 assert(State.VF.isVector() && "not widening");
867 Function *CalledScalarFn = getCalledScalarFunction();
868 assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
869 "DbgInfoIntrinsic should have been dropped during VPlan construction");
870 State.setDebugLocFrom(getDebugLoc());
871
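  // Widen the call either to a vector intrinsic or to a pre-selected vector
  // function variant, depending on how the recipe was created.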
872 bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
873 FunctionType *VFTy = nullptr;
874 if (Variant)
875 VFTy = Variant->getFunctionType();
876 for (unsigned Part = 0; Part < State.UF; ++Part) {
877 SmallVector<Type *, 2> TysForDecl;
878 // Add return type if intrinsic is overloaded on it.
879 if (UseIntrinsic &&
880 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
881 TysForDecl.push_back(VectorType::get(
882 CalledScalarFn->getReturnType()->getScalarType(), State.VF));
883 SmallVector<Value *, 4> Args;
884 for (const auto &I : enumerate(arg_operands())) {
885 // Some intrinsics have a scalar argument - don't replace it with a
886 // vector.
887 Value *Arg;
888 if (UseIntrinsic &&
889 isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
890 Arg = State.get(I.value(), VPIteration(0, 0));
891 // Some vectorized function variants may also take a scalar argument,
892 // e.g. linear parameters for pointers. This needs to be the scalar value
893 // from the start of the respective part when interleaving.
894 else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
895 Arg = State.get(I.value(), VPIteration(Part, 0));
896 else
897 Arg = State.get(I.value(), Part);
898 if (UseIntrinsic &&
899 isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
900 TysForDecl.push_back(Arg->getType());
901 Args.push_back(Arg);
902 }
903
904 Function *VectorF;
905 if (UseIntrinsic) {
906 // Use vector version of the intrinsic.
907 Module *M = State.Builder.GetInsertBlock()->getModule();
908 VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
909 assert(VectorF && "Can't retrieve vector intrinsic.");
910 } else {
911 #ifndef NDEBUG
912 assert(Variant != nullptr && "Can't create vector function.");
913 #endif
914 VectorF = Variant;
915 }
916
917 auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
918 SmallVector<OperandBundleDef, 1> OpBundles;
919 if (CI)
920 CI->getOperandBundlesAsDefs(OpBundles);
921
922 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
923
924 if (isa<FPMathOperator>(V))
925 V->copyFastMathFlags(CI);
926
927 if (!V->getType()->isVoidTy())
928 State.set(this, V, Part);
929 State.addMetadata(V, CI);
930 }
931 }
932
933 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
935 VPSlotTracker &SlotTracker) const {
936 O << Indent << "WIDEN-CALL ";
937
938 Function *CalledFn = getCalledScalarFunction();
939 if (CalledFn->getReturnType()->isVoidTy())
940 O << "void ";
941 else {
942 printAsOperand(O, SlotTracker);
943 O << " = ";
944 }
945
946 O << "call @" << CalledFn->getName() << "(";
947 interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
948 Op->printAsOperand(O, SlotTracker);
949 });
950 O << ")";
951
952 if (VectorIntrinsicID)
953 O << " (using vector intrinsic)";
954 else {
955 O << " (using library function";
956 if (Variant->hasName())
957 O << ": " << Variant->getName();
958 O << ")";
959 }
960 }
961
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
963 VPSlotTracker &SlotTracker) const {
964 O << Indent << "WIDEN-SELECT ";
965 printAsOperand(O, SlotTracker);
966 O << " = select ";
967 getOperand(0)->printAsOperand(O, SlotTracker);
968 O << ", ";
969 getOperand(1)->printAsOperand(O, SlotTracker);
970 O << ", ";
971 getOperand(2)->printAsOperand(O, SlotTracker);
972 O << (isInvariantCond() ? " (condition is loop invariant)" : "");
973 }
974 #endif
975
void VPWidenSelectRecipe::execute(VPTransformState &State) {
977 State.setDebugLocFrom(getDebugLoc());
978
979 // The condition can be loop invariant but still defined inside the
980 // loop. This means that we can't just use the original 'cond' value.
981 // We have to take the 'vectorized' value and pick the first lane.
982 // Instcombine will make this a no-op.
983 auto *InvarCond =
984 isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
985
986 for (unsigned Part = 0; Part < State.UF; ++Part) {
987 Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
988 Value *Op0 = State.get(getOperand(1), Part);
989 Value *Op1 = State.get(getOperand(2), Part);
990 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
991 State.set(this, Sel, Part);
992 State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
993 }
994 }
995
VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
997 const FastMathFlags &FMF) {
998 AllowReassoc = FMF.allowReassoc();
999 NoNaNs = FMF.noNaNs();
1000 NoInfs = FMF.noInfs();
1001 NoSignedZeros = FMF.noSignedZeros();
1002 AllowReciprocal = FMF.allowReciprocal();
1003 AllowContract = FMF.allowContract();
1004 ApproxFunc = FMF.approxFunc();
1005 }
1006
1007 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1009 switch (OpType) {
1010 case OperationType::Cmp:
1011 O << " " << CmpInst::getPredicateName(getPredicate());
1012 break;
1013 case OperationType::DisjointOp:
1014 if (DisjointFlags.IsDisjoint)
1015 O << " disjoint";
1016 break;
1017 case OperationType::PossiblyExactOp:
1018 if (ExactFlags.IsExact)
1019 O << " exact";
1020 break;
1021 case OperationType::OverflowingBinOp:
1022 if (WrapFlags.HasNUW)
1023 O << " nuw";
1024 if (WrapFlags.HasNSW)
1025 O << " nsw";
1026 break;
1027 case OperationType::FPMathOp:
1028 getFastMathFlags().print(O);
1029 break;
1030 case OperationType::GEPOp:
1031 if (GEPFlags.IsInBounds)
1032 O << " inbounds";
1033 break;
1034 case OperationType::NonNegOp:
1035 if (NonNegFlags.NonNeg)
1036 O << " nneg";
1037 break;
1038 case OperationType::Other:
1039 break;
1040 }
1041 if (getNumOperands() > 0)
1042 O << " ";
1043 }
1044 #endif
1045
void VPWidenRecipe::execute(VPTransformState &State) {
1047 State.setDebugLocFrom(getDebugLoc());
1048 auto &Builder = State.Builder;
1049 switch (Opcode) {
1050 case Instruction::Call:
1051 case Instruction::Br:
1052 case Instruction::PHI:
1053 case Instruction::GetElementPtr:
1054 case Instruction::Select:
1055 llvm_unreachable("This instruction is handled by a different recipe.");
1056 case Instruction::UDiv:
1057 case Instruction::SDiv:
1058 case Instruction::SRem:
1059 case Instruction::URem:
1060 case Instruction::Add:
1061 case Instruction::FAdd:
1062 case Instruction::Sub:
1063 case Instruction::FSub:
1064 case Instruction::FNeg:
1065 case Instruction::Mul:
1066 case Instruction::FMul:
1067 case Instruction::FDiv:
1068 case Instruction::FRem:
1069 case Instruction::Shl:
1070 case Instruction::LShr:
1071 case Instruction::AShr:
1072 case Instruction::And:
1073 case Instruction::Or:
1074 case Instruction::Xor: {
1075 // Just widen unops and binops.
1076 for (unsigned Part = 0; Part < State.UF; ++Part) {
1077 SmallVector<Value *, 2> Ops;
1078 for (VPValue *VPOp : operands())
1079 Ops.push_back(State.get(VPOp, Part));
1080
1081 Value *V = Builder.CreateNAryOp(Opcode, Ops);
1082
1083 if (auto *VecOp = dyn_cast<Instruction>(V))
1084 setFlags(VecOp);
1085
1086 // Use this vector value for all users of the original instruction.
1087 State.set(this, V, Part);
1088 State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1089 }
1090
1091 break;
1092 }
1093 case Instruction::Freeze: {
1094 for (unsigned Part = 0; Part < State.UF; ++Part) {
1095 Value *Op = State.get(getOperand(0), Part);
1096
1097 Value *Freeze = Builder.CreateFreeze(Op);
1098 State.set(this, Freeze, Part);
1099 }
1100 break;
1101 }
1102 case Instruction::ICmp:
1103 case Instruction::FCmp: {
1104 // Widen compares. Generate vector compares.
1105 bool FCmp = Opcode == Instruction::FCmp;
1106 for (unsigned Part = 0; Part < State.UF; ++Part) {
1107 Value *A = State.get(getOperand(0), Part);
1108 Value *B = State.get(getOperand(1), Part);
1109 Value *C = nullptr;
1110 if (FCmp) {
1111 // Propagate fast math flags.
1112 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1113 if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
1114 Builder.setFastMathFlags(I->getFastMathFlags());
1115 C = Builder.CreateFCmp(getPredicate(), A, B);
1116 } else {
1117 C = Builder.CreateICmp(getPredicate(), A, B);
1118 }
1119 State.set(this, C, Part);
1120 State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
1121 }
1122
1123 break;
1124 }
1125 default:
1126 // This instruction is not vectorized by simple widening.
1127 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1128 << Instruction::getOpcodeName(Opcode));
1129 llvm_unreachable("Unhandled instruction!");
1130 } // end of switch.
1131
1132 #if !defined(NDEBUG)
1133 // Verify that VPlan type inference results agree with the type of the
1134 // generated values.
1135 for (unsigned Part = 0; Part < State.UF; ++Part) {
1136 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
1137 State.VF) == State.get(this, Part)->getType() &&
1138 "inferred type and type from generated instructions do not match");
1139 }
1140 #endif
1141 }
1142
1143 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1145 VPSlotTracker &SlotTracker) const {
1146 O << Indent << "WIDEN ";
1147 printAsOperand(O, SlotTracker);
1148 O << " = " << Instruction::getOpcodeName(Opcode);
1149 printFlags(O);
1150 printOperands(O, SlotTracker);
1151 }
1152 #endif
1153
void VPWidenCastRecipe::execute(VPTransformState &State) {
1155 State.setDebugLocFrom(getDebugLoc());
1156 auto &Builder = State.Builder;
1157 /// Vectorize casts.
1158 assert(State.VF.isVector() && "Not vectorizing?");
1159 Type *DestTy = VectorType::get(getResultType(), State.VF);
1160 VPValue *Op = getOperand(0);
1161 for (unsigned Part = 0; Part < State.UF; ++Part) {
1162 if (Part > 0 && Op->isLiveIn()) {
1163 // FIXME: Remove once explicit unrolling is implemented using VPlan.
1164 State.set(this, State.get(this, 0), Part);
1165 continue;
1166 }
1167 Value *A = State.get(Op, Part);
1168 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
1169 State.set(this, Cast, Part);
1170 State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
1171 }
1172 }
1173
1174 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1176 VPSlotTracker &SlotTracker) const {
1177 O << Indent << "WIDEN-CAST ";
1178 printAsOperand(O, SlotTracker);
1179 O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1180 printFlags(O);
1181 printOperands(O, SlotTracker);
1182 O << " to " << *getResultType();
1183 }
1184 #endif
1185
1186 /// This function adds
1187 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at \p StartIdx.
/// \p BinOp is relevant for FP induction variables.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
1191 Instruction::BinaryOps BinOp, ElementCount VF,
1192 IRBuilderBase &Builder) {
1193 assert(VF.isVector() && "only vector VFs are supported");
1194
1195 // Create and check the types.
1196 auto *ValVTy = cast<VectorType>(Val->getType());
1197 ElementCount VLen = ValVTy->getElementCount();
1198
1199 Type *STy = Val->getType()->getScalarType();
1200 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1201 "Induction Step must be an integer or FP");
1202 assert(Step->getType() == STy && "Step has wrong type");
1203
1204 SmallVector<Constant *, 8> Indices;
1205
1206 // Create a vector of consecutive numbers from zero to VF.
1207 VectorType *InitVecValVTy = ValVTy;
1208 if (STy->isFloatingPointTy()) {
1209 Type *InitVecValSTy =
1210 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
1211 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
1212 }
1213 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
1214
1215 // Splat the StartIdx
1216 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
1217
1218 if (STy->isIntegerTy()) {
1219 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
1220 Step = Builder.CreateVectorSplat(VLen, Step);
1221 assert(Step->getType() == Val->getType() && "Invalid step vec");
1222 // FIXME: The newly created binary instructions should contain nsw/nuw
1223 // flags, which can be found from the original scalar operations.
1224 Step = Builder.CreateMul(InitVec, Step);
1225 return Builder.CreateAdd(Val, Step, "induction");
1226 }
1227
1228 // Floating point induction.
1229 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1230 "Binary Opcode should be specified for FP induction");
1231 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
1232 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
1233
1234 Step = Builder.CreateVectorSplat(VLen, Step);
1235 Value *MulOp = Builder.CreateFMul(InitVec, Step);
1236 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1237 }
1238
1239 /// A helper function that returns an integer or floating-point constant with
1240 /// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1242 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
1243 : ConstantFP::get(Ty, C);
1244 }
1245
static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1247 ElementCount VF) {
1248 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1249 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1250 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1251 return B.CreateUIToFP(RuntimeVF, FTy);
1252 }
1253
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1255 assert(!State.Instance && "Int or FP induction being replicated.");
1256
1257 Value *Start = getStartValue()->getLiveInIRValue();
1258 const InductionDescriptor &ID = getInductionDescriptor();
1259 TruncInst *Trunc = getTruncInst();
1260 IRBuilderBase &Builder = State.Builder;
1261 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1262 assert(State.VF.isVector() && "must have vector VF");
1263
1264 // The value from the original loop to which we are mapping the new induction
1265 // variable.
1266 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1267
1268 // Fast-math-flags propagate from the original induction instruction.
1269 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1270 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
1271 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1272
1273 // Now do the actual transformations, and start with fetching the step value.
1274 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1275
1276 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1277 "Expected either an induction phi-node or a truncate of it!");
1278
1279 // Construct the initial value of the vector IV in the vector loop preheader
1280 auto CurrIP = Builder.saveIP();
1281 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1282 Builder.SetInsertPoint(VectorPH->getTerminator());
1283 if (isa<TruncInst>(EntryVal)) {
1284 assert(Start->getType()->isIntegerTy() &&
1285 "Truncation requires an integer type");
1286 auto *TruncType = cast<IntegerType>(EntryVal->getType());
1287 Step = Builder.CreateTrunc(Step, TruncType);
1288 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1289 }
1290
1291 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
1292 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1293 Value *SteppedStart = getStepVector(
1294 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
1295
1296 // We create vector phi nodes for both integer and floating-point induction
1297 // variables. Here, we determine the kind of arithmetic we will perform.
1298 Instruction::BinaryOps AddOp;
1299 Instruction::BinaryOps MulOp;
1300 if (Step->getType()->isIntegerTy()) {
1301 AddOp = Instruction::Add;
1302 MulOp = Instruction::Mul;
1303 } else {
1304 AddOp = ID.getInductionOpcode();
1305 MulOp = Instruction::FMul;
1306 }
1307
1308 // Multiply the vectorization factor by the step using integer or
1309 // floating-point arithmetic as appropriate.
1310 Type *StepType = Step->getType();
1311 Value *RuntimeVF;
1312 if (Step->getType()->isFloatingPointTy())
1313 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
1314 else
1315 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
1316 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1317
1318 // Create a vector splat to use in the induction update.
1319 //
1320 // FIXME: If the step is non-constant, we create the vector splat with
1321 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1322 // handle a constant vector splat.
1323 Value *SplatVF = isa<Constant>(Mul)
1324 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
1325 : Builder.CreateVectorSplat(State.VF, Mul);
1326 Builder.restoreIP(CurrIP);
1327
1328 // We may need to add the step a number of times, depending on the unroll
1329 // factor. The last of those goes into the PHI.
1330 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1331 VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1332 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1333 Instruction *LastInduction = VecInd;
1334 for (unsigned Part = 0; Part < State.UF; ++Part) {
1335 State.set(this, LastInduction, Part);
1336
1337 if (isa<TruncInst>(EntryVal))
1338 State.addMetadata(LastInduction, EntryVal);
1339
1340 LastInduction = cast<Instruction>(
1341 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
1342 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1343 }
1344
1345 LastInduction->setName("vec.ind.next");
1346 VecInd->addIncoming(SteppedStart, VectorPH);
1347 // Add induction update using an incorrect block temporarily. The phi node
1348 // will be fixed after VPlan execution. Note that at this point the latch
1349 // block cannot be used, as it does not exist yet.
1350 // TODO: Model increment value in VPlan, by turning the recipe into a
1351 // multi-def and a subclass of VPHeaderPHIRecipe.
1352 VecInd->addIncoming(LastInduction, VectorPH);
1353 }
1354
1355 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1357 VPSlotTracker &SlotTracker) const {
1358 O << Indent << "WIDEN-INDUCTION";
1359 if (getTruncInst()) {
1360 O << "\\l\"";
1361 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
1362 O << " +\n" << Indent << "\" ";
1363 getVPValue(0)->printAsOperand(O, SlotTracker);
1364 } else
1365 O << " " << VPlanIngredient(IV);
1366
1367 O << ", ";
1368 getStepValue()->printAsOperand(O, SlotTracker);
1369 }
1370 #endif
1371
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1373 // The step may be defined by a recipe in the preheader (e.g. if it requires
1374 // SCEV expansion), but for the canonical induction the step is required to be
1375 // 1, which is represented as live-in.
1376 if (getStepValue()->getDefiningRecipe())
1377 return false;
1378 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1379 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1380 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
1381 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1382 getScalarType() == CanIV->getScalarType();
1383 }
1384
1385 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1387 VPSlotTracker &SlotTracker) const {
1388 O << Indent;
1389 printAsOperand(O, SlotTracker);
1390 O << Indent << "= DERIVED-IV ";
1391 getStartValue()->printAsOperand(O, SlotTracker);
1392 O << " + ";
1393 getOperand(1)->printAsOperand(O, SlotTracker);
1394 O << " * ";
1395 getStepValue()->printAsOperand(O, SlotTracker);
1396 }
1397 #endif
1398
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1400 // Fast-math-flags propagate from the original induction instruction.
1401 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1402 if (hasFastMathFlags())
1403 State.Builder.setFastMathFlags(getFastMathFlags());
1404
  // Compute scalar induction steps. BaseIV is the scalar induction value on
  // which to base the steps, and Step is the size of each step.
1407
1408 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
1409 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1410 IRBuilderBase &Builder = State.Builder;
1411
1412 // Ensure step has the same type as that of scalar IV.
1413 Type *BaseIVTy = BaseIV->getType()->getScalarType();
1414 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1415
1416 // We build scalar steps for both integer and floating-point induction
1417 // variables. Here, we determine the kind of arithmetic we will perform.
1418 Instruction::BinaryOps AddOp;
1419 Instruction::BinaryOps MulOp;
1420 if (BaseIVTy->isIntegerTy()) {
1421 AddOp = Instruction::Add;
1422 MulOp = Instruction::Mul;
1423 } else {
1424 AddOp = InductionOpcode;
1425 MulOp = Instruction::FMul;
1426 }
1427
1428 // Determine the number of scalars we need to generate for each unroll
1429 // iteration.
1430 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1431 // Compute the scalar steps and save the results in State.
1432 Type *IntStepTy =
1433 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1434 Type *VecIVTy = nullptr;
1435 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1436 if (!FirstLaneOnly && State.VF.isScalable()) {
1437 VecIVTy = VectorType::get(BaseIVTy, State.VF);
1438 UnitStepVec =
1439 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1440 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1441 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1442 }
1443
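  // When executing a single replicate instance, only emit the requested part
  // and lane; otherwise emit steps for all parts and lanes.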
1444 unsigned StartPart = 0;
1445 unsigned EndPart = State.UF;
1446 unsigned StartLane = 0;
1447 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1448 if (State.Instance) {
1449 StartPart = State.Instance->Part;
1450 EndPart = StartPart + 1;
1451 StartLane = State.Instance->Lane.getKnownLane();
1452 EndLane = StartLane + 1;
1453 }
1454 for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1455 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
1456
1457 if (!FirstLaneOnly && State.VF.isScalable()) {
1458 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1459 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1460 if (BaseIVTy->isFloatingPointTy())
1461 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1462 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1463 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1464 State.set(this, Add, Part);
1465 // It's also useful to record the lane values for the known minimum number
1466 // of elements, so we do that below. This improves code quality when, for
1467 // example, extracting the first element.
1468 }
1469
1470 if (BaseIVTy->isFloatingPointTy())
1471 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1472
1473 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1474 Value *StartIdx = Builder.CreateBinOp(
1475 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1476 // The step returned by `createStepForVF` is a runtime-evaluated value
1477 // when VF is scalable. Otherwise, it should be folded into a Constant.
1478 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1479 "Expected StartIdx to be folded to a constant when VF is not "
1480 "scalable");
1481 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1482 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1483 State.set(this, Add, VPIteration(Part, Lane));
1484 }
1485 }
1486 }
1487
1488 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1489 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1490 VPSlotTracker &SlotTracker) const {
1491 O << Indent;
1492 printAsOperand(O, SlotTracker);
1493 O << " = SCALAR-STEPS ";
1494 printOperands(O, SlotTracker);
1495 }
1496 #endif
1497
1498 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1499 assert(State.VF.isVector() && "not widening");
1500 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1501 // Construct a vector GEP by widening the operands of the scalar GEP as
1502 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1503 // results in a vector of pointers when at least one operand of the GEP
1504 // is vector-typed. Thus, to keep the representation compact, we only use
1505 // vector-typed operands for loop-varying values.
1506
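// For example, a scalar
//   %g = getelementptr inbounds i32, ptr %A, i64 %iv
// with a loop-varying index is widened (sketch) to
//   %g.vec = getelementptr inbounds i32, ptr %A, <VF x i64> %vec.iv
// which yields a vector of pointers; loop-invariant operands stay scalar.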
1507 if (areAllOperandsInvariant()) {
1508 // If we are vectorizing, but the GEP has only loop-invariant operands,
1509 // the GEP we build (by only using vector-typed operands for
1510 // loop-varying values) would be a scalar pointer. Thus, to ensure we
1511 // produce a vector of pointers, we need to either arbitrarily pick an
1512 // operand to broadcast, or broadcast a clone of the original GEP.
1513 // Here, we broadcast a clone of the original.
1514 //
1515 // TODO: If at some point we decide to scalarize instructions having
1516 // loop-invariant operands, this special case will no longer be
1517 // required. We would add the scalarization decision to
1518 // collectLoopScalars() and teach getVectorValue() to broadcast
1519 // the lane-zero scalar value.
1520 SmallVector<Value *> Ops;
1521 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1522 Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1523
1524 auto *NewGEP =
1525 State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1526 ArrayRef(Ops).drop_front(), "", isInBounds());
1527 for (unsigned Part = 0; Part < State.UF; ++Part) {
1528 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1529 State.set(this, EntryPart, Part);
1530 State.addMetadata(EntryPart, GEP);
1531 }
1532 } else {
1533 // If the GEP has at least one loop-varying operand, we are sure to
1534 // produce a vector of pointers. But if we are only unrolling, we want
1535 // to produce a scalar GEP for each unroll part. Thus, the GEP we
1536 // produce with the code below will be scalar (if VF == 1) or vector
1537 // (otherwise). Note that for the unroll-only case, we still maintain
1538 // values in the vector mapping with initVector, as we do for other
1539 // instructions.
1540 for (unsigned Part = 0; Part < State.UF; ++Part) {
1541 // The pointer operand of the new GEP. If it's loop-invariant, we
1542 // won't broadcast it.
1543 auto *Ptr = isPointerLoopInvariant()
1544 ? State.get(getOperand(0), VPIteration(0, 0))
1545 : State.get(getOperand(0), Part);
1546
1547 // Collect all the indices for the new GEP. If any index is
1548 // loop-invariant, we won't broadcast it.
1549 SmallVector<Value *, 4> Indices;
1550 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1551 VPValue *Operand = getOperand(I);
1552 if (isIndexLoopInvariant(I - 1))
1553 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1554 else
1555 Indices.push_back(State.get(Operand, Part));
1556 }
1557
1558 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1559 // but it should be a vector, otherwise.
1560 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1561 Indices, "", isInBounds());
1562 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1563 "NewGEP is not a pointer vector");
1564 State.set(this, NewGEP, Part);
1565 State.addMetadata(NewGEP, GEP);
1566 }
1567 }
1568 }
1569
1570 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1571 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1572 VPSlotTracker &SlotTracker) const {
1573 O << Indent << "WIDEN-GEP ";
1574 O << (isPointerLoopInvariant() ? "Inv" : "Var");
1575 for (size_t I = 0; I < getNumOperands() - 1; ++I)
1576 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1577
1578 O << " ";
1579 printAsOperand(O, SlotTracker);
1580 O << " = getelementptr";
1581 printFlags(O);
1582 printOperands(O, SlotTracker);
1583 }
1584 #endif
1585
1586 void VPVectorPointerRecipe::execute(VPTransformState &State) {
1587 auto &Builder = State.Builder;
1588 State.setDebugLocFrom(getDebugLoc());
1589 for (unsigned Part = 0; Part < State.UF; ++Part) {
1590 // Calculate the pointer for the specific unroll-part.
1591 Value *PartPtr = nullptr;
1592 // Use i32 for the gep index type when the value is constant,
1593 // or query DataLayout for a more suitable index type otherwise.
1594 const DataLayout &DL =
1595 Builder.GetInsertBlock()->getDataLayout();
1596 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1597 ? DL.getIndexType(IndexedTy->getPointerTo())
1598 : Builder.getInt32Ty();
1599 Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1600 bool InBounds = isInBounds();
1601 if (IsReverse) {
1602 // If the address is consecutive but reversed, then the
1603 // wide store needs to start at the last vector element.
1604 // RunTimeVF = VScale * VF.getKnownMinValue()
1605 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
1606 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1607 // NumElt = -Part * RunTimeVF
1608 Value *NumElt = Builder.CreateMul(
1609 ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1610 // LastLane = 1 - RunTimeVF
1611 Value *LastLane =
1612 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1613 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1614 PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
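// E.g. for fixed VF = 4 this makes PartPtr = Ptr - 3 for part 0 and
// Ptr - 7 for part 1, i.e. the lowest address touched by each reversed
// wide access.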
1615 } else {
1616 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1617 PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1618 }
1619
1620 State.set(this, PartPtr, Part, /*IsScalar*/ true);
1621 }
1622 }
1623
1624 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1625 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1626 VPSlotTracker &SlotTracker) const {
1627 O << Indent;
1628 printAsOperand(O, SlotTracker);
1629 O << " = vector-pointer ";
1630 if (IsReverse)
1631 O << "(reverse) ";
1632
1633 printOperands(O, SlotTracker);
1634 }
1635 #endif
1636
1637 void VPBlendRecipe::execute(VPTransformState &State) {
1638 State.setDebugLocFrom(getDebugLoc());
1639 // We know that all PHIs in non-header blocks are converted into
1640 // selects, so we don't have to worry about the insertion order and we
1641 // can just use the builder.
1642 // At this point we generate the predication tree. There may be
1643 // duplications since this is a simple recursive scan, but future
1644 // optimizations will clean it up.
1645
1646 unsigned NumIncoming = getNumIncomingValues();
1647
1648 // Generate a sequence of selects of the form:
1649 // SELECT(Mask3, In3,
1650 // SELECT(Mask2, In2,
1651 // SELECT(Mask1, In1,
1652 // In0)))
1653 // Note that Mask0 is never used: lanes for which no path reaches this phi
1654 // are essentially undef and take their value from In0.
1655 VectorParts Entry(State.UF);
1656 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
1657 for (unsigned In = 0; In < NumIncoming; ++In) {
1658 for (unsigned Part = 0; Part < State.UF; ++Part) {
1659 // We might have single edge PHIs (blocks) - use an identity
1660 // 'select' for the first PHI operand.
1661 Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
1662 if (In == 0)
1663 Entry[Part] = In0; // Initialize with the first incoming value.
1664 else {
1665 // Select between the current value and the previous incoming edge
1666 // based on the incoming mask.
1667 Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
1668 Entry[Part] =
1669 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1670 }
1671 }
1672 }
1673 for (unsigned Part = 0; Part < State.UF; ++Part)
1674 State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
1675 }
1676
1677 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1678 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1679 VPSlotTracker &SlotTracker) const {
1680 O << Indent << "BLEND ";
1681 printAsOperand(O, SlotTracker);
1682 O << " =";
1683 if (getNumIncomingValues() == 1) {
1684 // Not a User of any mask: not really blending, this is a
1685 // single-predecessor phi.
1686 O << " ";
1687 getIncomingValue(0)->printAsOperand(O, SlotTracker);
1688 } else {
1689 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1690 O << " ";
1691 getIncomingValue(I)->printAsOperand(O, SlotTracker);
1692 if (I == 0)
1693 continue;
1694 O << "/";
1695 getMask(I)->printAsOperand(O, SlotTracker);
1696 }
1697 }
1698 }
1699 #endif
1700
1701 void VPReductionRecipe::execute(VPTransformState &State) {
1702 assert(!State.Instance && "Reduction being replicated.");
1703 Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
1704 RecurKind Kind = RdxDesc.getRecurrenceKind();
1705 // Propagate the fast-math flags carried by the underlying instruction.
1706 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1707 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1708 for (unsigned Part = 0; Part < State.UF; ++Part) {
1709 Value *NewVecOp = State.get(getVecOp(), Part);
1710 if (VPValue *Cond = getCondOp()) {
1711 Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
1712 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
1713 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1714 Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
1715 RdxDesc.getFastMathFlags());
1716 if (State.VF.isVector()) {
1717 Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
1718 }
1719
1720 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
1721 NewVecOp = Select;
1722 }
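// After this select, masked-off lanes hold the neutral element for the
// reduction kind (e.g. 0 for an integer add, 1 for a mul), so they cannot
// change the reduced result.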
1723 Value *NewRed;
1724 Value *NextInChain;
1725 if (IsOrdered) {
1726 if (State.VF.isVector())
1727 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
1728 PrevInChain);
1729 else
1730 NewRed = State.Builder.CreateBinOp(
1731 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
1732 NewVecOp);
1733 PrevInChain = NewRed;
1734 } else {
1735 PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
1736 NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
1737 }
1738 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
1739 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
1740 NewRed, PrevInChain);
1741 } else if (IsOrdered)
1742 NextInChain = NewRed;
1743 else
1744 NextInChain = State.Builder.CreateBinOp(
1745 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
1746 State.set(this, NextInChain, Part, /*IsScalar*/ true);
1747 }
1748 }
1749
1750 void VPReductionEVLRecipe::execute(VPTransformState &State) {
1751 assert(!State.Instance && "Reduction being replicated.");
1752 assert(State.UF == 1 &&
1753 "Expected only UF == 1 when vectorizing with explicit vector length.");
1754
1755 auto &Builder = State.Builder;
1756 // Propagate the fast-math flags carried by the underlying instruction.
1757 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1758 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1759 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1760
1761 RecurKind Kind = RdxDesc.getRecurrenceKind();
1762 Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
1763 Value *VecOp = State.get(getVecOp(), 0);
1764 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
1765
1766 VectorBuilder VBuilder(Builder);
1767 VBuilder.setEVL(EVL);
1768 Value *Mask;
1769 // TODO: move the all-true mask generation into VectorBuilder.
1770 if (VPValue *CondOp = getCondOp())
1771 Mask = State.get(CondOp, 0);
1772 else
1773 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
1774 VBuilder.setMask(Mask);
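// Every reduction intrinsic built through VBuilder below is therefore
// predicated by both the (possibly all-true) mask and the explicit vector
// length EVL.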
1775
1776 Value *NewRed;
1777 if (isOrdered()) {
1778 NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
1779 } else {
1780 NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc);
1781 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1782 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
1783 else
1784 NewRed = Builder.CreateBinOp(
1785 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
1786 }
1787 State.set(this, NewRed, 0, /*IsScalar*/ true);
1788 }
1789
1790 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1791 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1792 VPSlotTracker &SlotTracker) const {
1793 O << Indent << "REDUCE ";
1794 printAsOperand(O, SlotTracker);
1795 O << " = ";
1796 getChainOp()->printAsOperand(O, SlotTracker);
1797 O << " +";
1798 if (isa<FPMathOperator>(getUnderlyingInstr()))
1799 O << getUnderlyingInstr()->getFastMathFlags();
1800 O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1801 getVecOp()->printAsOperand(O, SlotTracker);
1802 if (isConditional()) {
1803 O << ", ";
1804 getCondOp()->printAsOperand(O, SlotTracker);
1805 }
1806 O << ")";
1807 if (RdxDesc.IntermediateStore)
1808 O << " (with final reduction value stored in invariant address sank "
1809 "outside of loop)";
1810 }
1811
1812 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1813 VPSlotTracker &SlotTracker) const {
1814 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1815 O << Indent << "REDUCE ";
1816 printAsOperand(O, SlotTracker);
1817 O << " = ";
1818 getChainOp()->printAsOperand(O, SlotTracker);
1819 O << " +";
1820 if (isa<FPMathOperator>(getUnderlyingInstr()))
1821 O << getUnderlyingInstr()->getFastMathFlags();
1822 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1823 getVecOp()->printAsOperand(O, SlotTracker);
1824 O << ", ";
1825 getEVL()->printAsOperand(O, SlotTracker);
1826 if (isConditional()) {
1827 O << ", ";
1828 getCondOp()->printAsOperand(O, SlotTracker);
1829 }
1830 O << ")";
1831 if (RdxDesc.IntermediateStore)
1832 O << " (with final reduction value stored in invariant address sank "
1833 "outside of loop)";
1834 }
1835 #endif
1836
1837 bool VPReplicateRecipe::shouldPack() const {
1838 // Find if the recipe is used by a widened recipe via an intervening
1839 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
1840 return any_of(users(), [](const VPUser *U) {
1841 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1842 return any_of(PredR->users(), [PredR](const VPUser *U) {
1843 return !U->usesScalars(PredR);
1844 });
1845 return false;
1846 });
1847 }
1848
1849 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1850 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1851 VPSlotTracker &SlotTracker) const {
1852 O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1853
1854 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1855 printAsOperand(O, SlotTracker);
1856 O << " = ";
1857 }
1858 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1859 O << "call";
1860 printFlags(O);
1861 O << "@" << CB->getCalledFunction()->getName() << "(";
1862 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1863 O, [&O, &SlotTracker](VPValue *Op) {
1864 Op->printAsOperand(O, SlotTracker);
1865 });
1866 O << ")";
1867 } else {
1868 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1869 printFlags(O);
1870 printOperands(O, SlotTracker);
1871 }
1872
1873 if (shouldPack())
1874 O << " (S->V)";
1875 }
1876 #endif
1877
1878 /// Checks if \p C is uniform across all VFs and UFs. It is considered as such
1879 /// if it is either defined outside the vector region or its operand is known to
1880 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
1881 /// TODO: Uniformity should be associated with a VPValue and there should be a
1882 /// generic way to check.
1883 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
1884 return C->isDefinedOutsideVectorRegions() ||
1885 isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
1886 isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
1887 }
1888
1889 Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
1890 assert(vputils::onlyFirstLaneUsed(this) &&
1891 "Codegen only implemented for first lane.");
1892 switch (Opcode) {
1893 case Instruction::SExt:
1894 case Instruction::ZExt:
1895 case Instruction::Trunc: {
1896 // Note: SExt/ZExt not used yet.
1897 Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
1898 return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
1899 }
1900 default:
1901 llvm_unreachable("opcode not implemented yet");
1902 }
1903 }
1904
1905 void VPScalarCastRecipe::execute(VPTransformState &State) {
1906 bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
1907 for (unsigned Part = 0; Part != State.UF; ++Part) {
1908 Value *Res;
1909 // Only generate a single instance, if the recipe is uniform across UFs and
1910 // VFs.
1911 if (Part > 0 && IsUniformAcrossVFsAndUFs)
1912 Res = State.get(this, VPIteration(0, 0));
1913 else
1914 Res = generate(State, Part);
1915 State.set(this, Res, VPIteration(Part, 0));
1916 }
1917 }
1918
1919 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1920 void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
1921 VPSlotTracker &SlotTracker) const {
1922 O << Indent << "SCALAR-CAST ";
1923 printAsOperand(O, SlotTracker);
1924 O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1925 printOperands(O, SlotTracker);
1926 O << " to " << *ResultTy;
1927 }
1928 #endif
1929
1930 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1931 assert(State.Instance && "Branch on Mask works only on single instance.");
1932
1933 unsigned Part = State.Instance->Part;
1934 unsigned Lane = State.Instance->Lane.getKnownLane();
1935
1936 Value *ConditionBit = nullptr;
1937 VPValue *BlockInMask = getMask();
1938 if (BlockInMask) {
1939 ConditionBit = State.get(BlockInMask, Part);
1940 if (ConditionBit->getType()->isVectorTy())
1941 ConditionBit = State.Builder.CreateExtractElement(
1942 ConditionBit, State.Builder.getInt32(Lane));
1943 } else // Block in mask is all-one.
1944 ConditionBit = State.Builder.getTrue();
1945
1946 // Replace the temporary unreachable terminator with a new conditional branch,
1947 // whose two destinations will be set later when they are created.
1948 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1949 assert(isa<UnreachableInst>(CurrentTerminator) &&
1950 "Expected to replace unreachable terminator with conditional branch.");
1951 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
1952 CondBr->setSuccessor(0, nullptr);
1953 ReplaceInstWithInst(CurrentTerminator, CondBr);
1954 }
1955
1956 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
1957 assert(State.Instance && "Predicated instruction PHI works per instance.");
1958 Instruction *ScalarPredInst =
1959 cast<Instruction>(State.get(getOperand(0), *State.Instance));
1960 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
1961 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
1962 assert(PredicatingBB && "Predicated block has no single predecessor.");
1963 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
1964 "operand must be VPReplicateRecipe");
1965
1966 // By current pack/unpack logic we need to generate only a single phi node: if
1967 // a vector value for the predicated instruction exists at this point it means
1968 // the instruction has vector users only, and a phi for the vector value is
1969 // needed. In this case the recipe of the predicated instruction is marked to
1970 // also do that packing, thereby "hoisting" the insert-element sequence.
1971 // Otherwise, a phi node for the scalar value is needed.
1972 unsigned Part = State.Instance->Part;
1973 if (State.hasVectorValue(getOperand(0), Part)) {
1974 Value *VectorValue = State.get(getOperand(0), Part);
1975 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
1976 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
1977 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
1978 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
1979 if (State.hasVectorValue(this, Part))
1980 State.reset(this, VPhi, Part);
1981 else
1982 State.set(this, VPhi, Part);
1983 // NOTE: Currently we need to update the value of the operand, so the next
1984 // predicated iteration inserts its generated value in the correct vector.
1985 State.reset(getOperand(0), VPhi, Part);
1986 } else {
1987 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
1988 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
1989 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
1990 PredicatingBB);
1991 Phi->addIncoming(ScalarPredInst, PredicatedBB);
1992 if (State.hasScalarValue(this, *State.Instance))
1993 State.reset(this, Phi, *State.Instance);
1994 else
1995 State.set(this, Phi, *State.Instance);
1996 // NOTE: Currently we need to update the value of the operand, so the next
1997 // predicated iteration inserts its generated value in the correct vector.
1998 State.reset(getOperand(0), Phi, *State.Instance);
1999 }
2000 }
2001
2002 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2003 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2004 VPSlotTracker &SlotTracker) const {
2005 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2006 printAsOperand(O, SlotTracker);
2007 O << " = ";
2008 printOperands(O, SlotTracker);
2009 }
2010
2011 void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2012 VPSlotTracker &SlotTracker) const {
2013 O << Indent << "WIDEN ";
2014 printAsOperand(O, SlotTracker);
2015 O << " = load ";
2016 printOperands(O, SlotTracker);
2017 }
2018
2019 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2020 VPSlotTracker &SlotTracker) const {
2021 O << Indent << "WIDEN ";
2022 printAsOperand(O, SlotTracker);
2023 O << " = vp.load ";
2024 printOperands(O, SlotTracker);
2025 }
2026
2027 void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2028 VPSlotTracker &SlotTracker) const {
2029 O << Indent << "WIDEN store ";
2030 printOperands(O, SlotTracker);
2031 }
2032
2033 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2034 VPSlotTracker &SlotTracker) const {
2035 O << Indent << "WIDEN vp.store ";
2036 printOperands(O, SlotTracker);
2037 }
2038 #endif
2039
2040 static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2041 VectorType *DstVTy, const DataLayout &DL) {
2042 // Verify that V is a vector with the same number of elements as DstVTy.
2043 auto VF = DstVTy->getElementCount();
2044 auto *SrcVecTy = cast<VectorType>(V->getType());
2045 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2046 Type *SrcElemTy = SrcVecTy->getElementType();
2047 Type *DstElemTy = DstVTy->getElementType();
2048 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2049 "Vector elements must have same size");
2050
2051 // Do a direct cast if element types are castable.
2052 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2053 return Builder.CreateBitOrPointerCast(V, DstVTy);
2054 }
2055 // V cannot be cast directly to the desired vector type. This may happen when
2056 // V is a floating point vector but DstVTy is a vector of pointers, or
2057 // vice-versa. Handle this with a two-step bitcast through an intermediate
2058 // integer type, i.e. Ptr <-> Int <-> Float.
2059 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2060 "Only one type should be a pointer type");
2061 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2062 "Only one type should be a floating point type");
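// E.g. on a target with 64-bit pointers, <4 x ptr> is converted to
// <4 x double> via a ptrtoint to <4 x i64> followed by a bitcast.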
2063 Type *IntTy =
2064 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2065 auto *VecIntTy = VectorType::get(IntTy, VF);
2066 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2067 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2068 }
2069
2070 /// Return a vector containing interleaved elements from multiple
2071 /// smaller input vectors.
2072 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2073 const Twine &Name) {
2074 unsigned Factor = Vals.size();
2075 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2076
2077 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
2078 #ifndef NDEBUG
2079 for (Value *Val : Vals)
2080 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2081 #endif
2082
2083 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so we
2084 // must use intrinsics to interleave.
2085 if (VecTy->isScalableTy()) {
2086 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2087 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2088 Vals,
2089 /*FMFSource=*/nullptr, Name);
2090 }
2091
2092 // Fixed length. Start by concatenating all vectors into a wide vector.
2093 Value *WideVec = concatenateVectors(Builder, Vals);
2094
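// E.g. for Factor = 2 and fixed VF = 4 the two inputs are concatenated into an
// 8-element vector and shuffled with the mask <0, 4, 1, 5, 2, 6, 3, 7>.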
2095 // Interleave the elements into the wide vector.
2096 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2097 return Builder.CreateShuffleVector(
2098 WideVec, createInterleaveMask(NumElts, Factor), Name);
2099 }
2100
2101 // Try to vectorize the interleave group that \p Instr belongs to.
2102 //
2103 // E.g. Translate the following interleaved load group (factor = 3):
2104 // for (i = 0; i < N; i+=3) {
2105 // R = Pic[i]; // Member of index 0
2106 // G = Pic[i+1]; // Member of index 1
2107 // B = Pic[i+2]; // Member of index 2
2108 // ... // do something to R, G, B
2109 // }
2110 // To:
2111 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2112 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2113 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2114 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2115 //
2116 // Or translate the following interleaved store group (factor = 3):
2117 // for (i = 0; i < N; i+=3) {
2118 // ... do something to R, G, B
2119 // Pic[i] = R; // Member of index 0
2120 // Pic[i+1] = G; // Member of index 1
2121 // Pic[i+2] = B; // Member of index 2
2122 // }
2123 // To:
2124 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2125 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2126 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2127 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2128 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2129 void VPInterleaveRecipe::execute(VPTransformState &State) {
2130 assert(!State.Instance && "Interleave group being replicated.");
2131 const InterleaveGroup<Instruction> *Group = IG;
2132 Instruction *Instr = Group->getInsertPos();
2133
2134 // Prepare for the vector type of the interleaved load/store.
2135 Type *ScalarTy = getLoadStoreType(Instr);
2136 unsigned InterleaveFactor = Group->getFactor();
2137 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2138
2139 // Prepare for the new pointers.
2140 SmallVector<Value *, 2> AddrParts;
2141 unsigned Index = Group->getIndex(Instr);
2142
2143 // TODO: extend the masked interleaved-group support to reversed access.
2144 VPValue *BlockInMask = getMask();
2145 assert((!BlockInMask || !Group->isReverse()) &&
2146 "Reversed masked interleave-group not supported.");
2147
2148 Value *Idx;
2149 // If the group is reverse, adjust the index to refer to the last vector lane
2150 // instead of the first. We adjust the index from the first vector lane,
2151 // rather than directly getting the pointer for lane VF - 1, because the
2152 // pointer operand of the interleaved access is supposed to be uniform. For
2153 // uniform instructions, we're only required to generate a value for the
2154 // first vector lane in each unroll iteration.
2155 if (Group->isReverse()) {
2156 Value *RuntimeVF =
2157 getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
2158 Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
2159 Idx = State.Builder.CreateMul(Idx,
2160 State.Builder.getInt32(Group->getFactor()));
2161 Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
2162 Idx = State.Builder.CreateNeg(Idx);
2163 } else
2164 Idx = State.Builder.getInt32(-Index);
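// E.g. for a reversed group with factor 3 whose insert position is the member
// at index 1, with fixed VF = 4, Idx is -((4 - 1) * 3 + 1) = -10.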
2165
2166 VPValue *Addr = getAddr();
2167 for (unsigned Part = 0; Part < State.UF; Part++) {
2168 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2169 if (auto *I = dyn_cast<Instruction>(AddrPart))
2170 State.setDebugLocFrom(I->getDebugLoc());
2171
2172 // Note that the current instruction could be at any member index. The
2173 // address needs to be adjusted to the member of index 0.
2174 //
2175 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2176 // b = A[i]; // Member of index 0
2177 // Current pointer is pointed to A[i+1], adjust it to A[i].
2178 //
2179 // E.g. A[i+1] = a; // Member of index 1
2180 // A[i] = b; // Member of index 0
2181 // A[i+2] = c; // Member of index 2 (Current instruction)
2182 // Current pointer is pointed to A[i+2], adjust it to A[i].
2183
2184 bool InBounds = false;
2185 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2186 InBounds = gep->isInBounds();
2187 AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2188 AddrParts.push_back(AddrPart);
2189 }
2190
2191 State.setDebugLocFrom(Instr->getDebugLoc());
2192 Value *PoisonVec = PoisonValue::get(VecTy);
2193
2194 auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
2195 unsigned Part, Value *MaskForGaps) -> Value * {
2196 if (State.VF.isScalable()) {
2197 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2198 assert(InterleaveFactor == 2 &&
2199 "Unsupported deinterleave factor for scalable vectors");
2200 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2201 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2202 auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2203 State.VF.getKnownMinValue() * 2, true);
2204 return State.Builder.CreateIntrinsic(
2205 MaskTy, Intrinsic::vector_interleave2, Ops,
2206 /*FMFSource=*/nullptr, "interleaved.mask");
2207 }
2208
2209 if (!BlockInMask)
2210 return MaskForGaps;
2211
2212 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2213 Value *ShuffledMask = State.Builder.CreateShuffleVector(
2214 BlockInMaskPart,
2215 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2216 "interleaved.mask");
2217 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
2218 ShuffledMask, MaskForGaps)
2219 : ShuffledMask;
2220 };
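// E.g. with InterleaveFactor = 2 and fixed VF = 4, a block mask <a, b, c, d>
// is replicated to the group mask <a, a, b, b, c, c, d, d> before being ANDed
// with any gap mask.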
2221
2222 const DataLayout &DL = Instr->getDataLayout();
2223 // Vectorize the interleaved load group.
2224 if (isa<LoadInst>(Instr)) {
2225 Value *MaskForGaps = nullptr;
2226 if (NeedsMaskForGaps) {
2227 MaskForGaps = createBitMaskForGaps(State.Builder,
2228 State.VF.getKnownMinValue(), *Group);
2229 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2230 }
2231
2232 // For each unroll part, create a wide load for the group.
2233 SmallVector<Value *, 2> NewLoads;
2234 for (unsigned Part = 0; Part < State.UF; Part++) {
2235 Instruction *NewLoad;
2236 if (BlockInMask || MaskForGaps) {
2237 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2238 NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
2239 Group->getAlign(), GroupMask,
2240 PoisonVec, "wide.masked.vec");
2241 } else
2242 NewLoad = State.Builder.CreateAlignedLoad(
2243 VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
2244 Group->addMetadata(NewLoad);
2245 NewLoads.push_back(NewLoad);
2246 }
2247
2248 ArrayRef<VPValue *> VPDefs = definedValues();
2249 const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2250 if (VecTy->isScalableTy()) {
2251 assert(InterleaveFactor == 2 &&
2252 "Unsupported deinterleave factor for scalable vectors");
2253
2254 for (unsigned Part = 0; Part < State.UF; ++Part) {
2255 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2256 // so we must use intrinsics to deinterleave.
2257 Value *DI = State.Builder.CreateIntrinsic(
2258 Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
2259 /*FMFSource=*/nullptr, "strided.vec");
2260 unsigned J = 0;
2261 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2262 Instruction *Member = Group->getMember(I);
2263
2264 if (!Member)
2265 continue;
2266
2267 Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
2268 // If this member has a different type, cast the result accordingly.
2269 if (Member->getType() != ScalarTy) {
2270 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2271 StridedVec =
2272 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2273 }
2274
2275 if (Group->isReverse())
2276 StridedVec =
2277 State.Builder.CreateVectorReverse(StridedVec, "reverse");
2278
2279 State.set(VPDefs[J], StridedVec, Part);
2280 ++J;
2281 }
2282 }
2283
2284 return;
2285 }
2286
2287 // For each member in the group, shuffle out the appropriate data from the
2288 // wide loads.
2289 unsigned J = 0;
2290 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2291 Instruction *Member = Group->getMember(I);
2292
2293 // Skip the gaps in the group.
2294 if (!Member)
2295 continue;
2296
2297 auto StrideMask =
2298 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
2299 for (unsigned Part = 0; Part < State.UF; Part++) {
2300 Value *StridedVec = State.Builder.CreateShuffleVector(
2301 NewLoads[Part], StrideMask, "strided.vec");
2302
2303 // If this member has a different type, cast the result accordingly.
2304 if (Member->getType() != ScalarTy) {
2305 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2306 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2307 StridedVec =
2308 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
2309 }
2310
2311 if (Group->isReverse())
2312 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
2313
2314 State.set(VPDefs[J], StridedVec, Part);
2315 }
2316 ++J;
2317 }
2318 return;
2319 }
2320
2321 // The sub vector type for current instruction.
2322 auto *SubVT = VectorType::get(ScalarTy, State.VF);
2323
2324 // Vectorize the interleaved store group.
2325 Value *MaskForGaps =
2326 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
2327 assert((!MaskForGaps || !State.VF.isScalable()) &&
2328 "masking gaps for scalable vectors is not yet supported.");
2329 ArrayRef<VPValue *> StoredValues = getStoredValues();
2330 for (unsigned Part = 0; Part < State.UF; Part++) {
2331 // Collect the stored vector from each member.
2332 SmallVector<Value *, 4> StoredVecs;
2333 unsigned StoredIdx = 0;
2334 for (unsigned i = 0; i < InterleaveFactor; i++) {
2335 assert((Group->getMember(i) || MaskForGaps) &&
2336 "Failed to get a member from an interleaved store group");
2337 Instruction *Member = Group->getMember(i);
2338
2339 // Skip the gaps in the group.
2340 if (!Member) {
2341 Value *Undef = PoisonValue::get(SubVT);
2342 StoredVecs.push_back(Undef);
2343 continue;
2344 }
2345
2346 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2347 ++StoredIdx;
2348
2349 if (Group->isReverse())
2350 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
2351
2352 // If this member has a different type, cast it to a unified type.
2353
2354 if (StoredVec->getType() != SubVT)
2355 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
2356
2357 StoredVecs.push_back(StoredVec);
2358 }
2359
2360 // Interleave all the smaller vectors into one wider vector.
2361 Value *IVec =
2362 interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
2363 Instruction *NewStoreInstr;
2364 if (BlockInMask || MaskForGaps) {
2365 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2366 NewStoreInstr = State.Builder.CreateMaskedStore(
2367 IVec, AddrParts[Part], Group->getAlign(), GroupMask);
2368 } else
2369 NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
2370 Group->getAlign());
2371
2372 Group->addMetadata(NewStoreInstr);
2373 }
2374 }
2375
2376 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2377 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
2378 VPSlotTracker &SlotTracker) const {
2379 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
2380 IG->getInsertPos()->printAsOperand(O, false);
2381 O << ", ";
2382 getAddr()->printAsOperand(O, SlotTracker);
2383 VPValue *Mask = getMask();
2384 if (Mask) {
2385 O << ", ";
2386 Mask->printAsOperand(O, SlotTracker);
2387 }
2388
2389 unsigned OpIdx = 0;
2390 for (unsigned i = 0; i < IG->getFactor(); ++i) {
2391 if (!IG->getMember(i))
2392 continue;
2393 if (getNumStoreOperands() > 0) {
2394 O << "\n" << Indent << " store ";
2395 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
2396 O << " to index " << i;
2397 } else {
2398 O << "\n" << Indent << " ";
2399 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
2400 O << " = load from index " << i;
2401 }
2402 ++OpIdx;
2403 }
2404 }
2405 #endif
2406
2407 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
2408 Value *Start = getStartValue()->getLiveInIRValue();
2409 PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
2410 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2411
2412 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2413 EntryPart->addIncoming(Start, VectorPH);
2414 EntryPart->setDebugLoc(getDebugLoc());
2415 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2416 State.set(this, EntryPart, Part, /*IsScalar*/ true);
2417 }
2418
2419 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2420 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2421 VPSlotTracker &SlotTracker) const {
2422 O << Indent << "EMIT ";
2423 printAsOperand(O, SlotTracker);
2424 O << " = CANONICAL-INDUCTION ";
2425 printOperands(O, SlotTracker);
2426 }
2427 #endif
2428
2429 bool VPCanonicalIVPHIRecipe::isCanonical(
2430 InductionDescriptor::InductionKind Kind, VPValue *Start,
2431 VPValue *Step) const {
2432 // Must be an integer induction.
2433 if (Kind != InductionDescriptor::IK_IntInduction)
2434 return false;
2435 // Start must match the start value of this canonical induction.
2436 if (Start != getStartValue())
2437 return false;
2438
2439 // If the step is defined by a recipe, it is not a ConstantInt.
2440 if (Step->getDefiningRecipe())
2441 return false;
2442
2443 ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
2444 return StepC && StepC->isOne();
2445 }
2446
2447 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
2448 return IsScalarAfterVectorization &&
2449 (!IsScalable || vputils::onlyFirstLaneUsed(this));
2450 }
2451
2452 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2453 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2454 VPSlotTracker &SlotTracker) const {
2455 O << Indent << "EMIT ";
2456 printAsOperand(O, SlotTracker);
2457 O << " = WIDEN-POINTER-INDUCTION ";
2458 getStartValue()->printAsOperand(O, SlotTracker);
2459 O << ", " << *IndDesc.getStep();
2460 }
2461 #endif
2462
2463 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
2464 assert(!State.Instance && "cannot be used in per-lane");
2465 const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2466 SCEVExpander Exp(SE, DL, "induction");
2467
2468 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
2469 &*State.Builder.GetInsertPoint());
2470 assert(!State.ExpandedSCEVs.contains(Expr) &&
2471 "Same SCEV expanded multiple times");
2472 State.ExpandedSCEVs[Expr] = Res;
2473 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2474 State.set(this, Res, {Part, 0});
2475 }
2476
2477 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2478 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
2479 VPSlotTracker &SlotTracker) const {
2480 O << Indent << "EMIT ";
2481 getVPSingleValue()->printAsOperand(O, SlotTracker);
2482 O << " = EXPAND SCEV " << *Expr;
2483 }
2484 #endif
2485
2486 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
2487 Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
2488 Type *STy = CanonicalIV->getType();
2489 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
2490 ElementCount VF = State.VF;
2491 Value *VStart = VF.isScalar()
2492 ? CanonicalIV
2493 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
2494 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2495 Value *VStep = createStepForVF(Builder, STy, VF, Part);
2496 if (VF.isVector()) {
2497 VStep = Builder.CreateVectorSplat(VF, VStep);
2498 VStep =
2499 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
2500 }
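// E.g. for fixed VF = 4 and Part = 1 this produces
//   vec.iv = splat(canonical-iv) + <4, 5, 6, 7>.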
2501 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
2502 State.set(this, CanonicalVectorIV, Part);
2503 }
2504 }
2505
2506 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2507 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
2508 VPSlotTracker &SlotTracker) const {
2509 O << Indent << "EMIT ";
2510 printAsOperand(O, SlotTracker);
2511 O << " = WIDEN-CANONICAL-INDUCTION ";
2512 printOperands(O, SlotTracker);
2513 }
2514 #endif
2515
2516 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
2517 auto &Builder = State.Builder;
2518 // Create a vector from the initial value.
2519 auto *VectorInit = getStartValue()->getLiveInIRValue();
2520
2521 Type *VecTy = State.VF.isScalar()
2522 ? VectorInit->getType()
2523 : VectorType::get(VectorInit->getType(), State.VF);
2524
2525 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2526 if (State.VF.isVector()) {
2527 auto *IdxTy = Builder.getInt32Ty();
2528 auto *One = ConstantInt::get(IdxTy, 1);
2529 IRBuilder<>::InsertPointGuard Guard(Builder);
2530 Builder.SetInsertPoint(VectorPH->getTerminator());
2531 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
2532 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
2533 VectorInit = Builder.CreateInsertElement(
2534 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
2535 }
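// E.g. for fixed VF = 4 the recurrence phi starts out as
//   <poison, poison, poison, %init>
// so the scalar initial value occupies the last lane.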
2536
2537 // Create a phi node for the new recurrence.
2538 PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
2539 EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
2540 EntryPart->addIncoming(VectorInit, VectorPH);
2541 State.set(this, EntryPart, 0);
2542 }
2543
2544 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2545 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
2546 VPSlotTracker &SlotTracker) const {
2547 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
2548 printAsOperand(O, SlotTracker);
2549 O << " = phi ";
2550 printOperands(O, SlotTracker);
2551 }
2552 #endif
2553
2554 void VPReductionPHIRecipe::execute(VPTransformState &State) {
2555 auto &Builder = State.Builder;
2556
2557 // Reductions do not have to start at zero. They can start with
2558 // any loop invariant values.
2559 VPValue *StartVPV = getStartValue();
2560 Value *StartV = StartVPV->getLiveInIRValue();
2561
2562 // In order to support recurrences we need to be able to vectorize Phi nodes.
2563 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
2564 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
2565 // this value when we vectorize all of the instructions that use the PHI.
2566 bool ScalarPHI = State.VF.isScalar() || IsInLoop;
2567 Type *VecTy = ScalarPHI ? StartV->getType()
2568 : VectorType::get(StartV->getType(), State.VF);
2569
2570 BasicBlock *HeaderBB = State.CFG.PrevBB;
2571 assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
2572 "recipe must be in the vector loop header");
2573 unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
2574 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
2575 Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
2576 EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
2577 State.set(this, EntryPart, Part, IsInLoop);
2578 }
2579
2580 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2581
2582 Value *Iden = nullptr;
2583 RecurKind RK = RdxDesc.getRecurrenceKind();
2584 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
2585 RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
2586 // MinMax and AnyOf reductions have the start value as their identity.
2587 if (ScalarPHI) {
2588 Iden = StartV;
2589 } else {
2590 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
2591 Builder.SetInsertPoint(VectorPH->getTerminator());
2592 StartV = Iden =
2593 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
2594 }
2595 } else {
2596 Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
2597 RdxDesc.getFastMathFlags());
2598
2599 if (!ScalarPHI) {
2600 Iden = Builder.CreateVectorSplat(State.VF, Iden);
2601 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
2602 Builder.SetInsertPoint(VectorPH->getTerminator());
2603 Constant *Zero = Builder.getInt32(0);
2604 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
2605 }
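// E.g. an integer add reduction with start value 5 and fixed VF = 4 enters the
// vector loop as <5, 0, 0, 0>: the remaining lanes carry the identity, so the
// final horizontal reduction adds the start value exactly once.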
2606 }
2607
2608 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
2609 Value *EntryPart = State.get(this, Part, IsInLoop);
2610 // Make sure to add the reduction start value only to the
2611 // first unroll part.
2612 Value *StartVal = (Part == 0) ? StartV : Iden;
2613 cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
2614 }
2615 }
2616
2617 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2618 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2619 VPSlotTracker &SlotTracker) const {
2620 O << Indent << "WIDEN-REDUCTION-PHI ";
2621
2622 printAsOperand(O, SlotTracker);
2623 O << " = phi ";
2624 printOperands(O, SlotTracker);
2625 }
2626 #endif
2627
2628 void VPWidenPHIRecipe::execute(VPTransformState &State) {
2629 assert(EnableVPlanNativePath &&
2630 "Non-native vplans are not expected to have VPWidenPHIRecipes.");
2631
2632 Value *Op0 = State.get(getOperand(0), 0);
2633 Type *VecTy = Op0->getType();
2634 Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
2635 State.set(this, VecPhi, 0);
2636 }
2637
2638 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2639 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2640 VPSlotTracker &SlotTracker) const {
2641 O << Indent << "WIDEN-PHI ";
2642
2643 auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
2644 // Unless all incoming values are modeled in VPlan, print the original PHI
2645 // directly.
2646 // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
2647 // values as VPValues.
2648 if (getNumOperands() != OriginalPhi->getNumOperands()) {
2649 O << VPlanIngredient(OriginalPhi);
2650 return;
2651 }
2652
2653 printAsOperand(O, SlotTracker);
2654 O << " = phi ";
2655 printOperands(O, SlotTracker);
2656 }
2657 #endif
2658
2659 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
2660 // remove VPActiveLaneMaskPHIRecipe.
2661 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
2662 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2663 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2664 Value *StartMask = State.get(getOperand(0), Part);
2665 PHINode *EntryPart =
2666 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
2667 EntryPart->addIncoming(StartMask, VectorPH);
2668 EntryPart->setDebugLoc(getDebugLoc());
2669 State.set(this, EntryPart, Part);
2670 }
2671 }
2672
2673 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2674 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2675 VPSlotTracker &SlotTracker) const {
2676 O << Indent << "ACTIVE-LANE-MASK-PHI ";
2677
2678 printAsOperand(O, SlotTracker);
2679 O << " = phi ";
2680 printOperands(O, SlotTracker);
2681 }
2682 #endif
2683
2684 void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
2685 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
2686 assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
2687 Value *Start = State.get(getOperand(0), VPIteration(0, 0));
2688 PHINode *EntryPart =
2689 State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
2690 EntryPart->addIncoming(Start, VectorPH);
2691 EntryPart->setDebugLoc(getDebugLoc());
2692 State.set(this, EntryPart, 0, /*IsScalar=*/true);
2693 }
2694
2695 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2696 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2697 VPSlotTracker &SlotTracker) const {
2698 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
2699
2700 printAsOperand(O, SlotTracker);
2701 O << " = phi ";
2702 printOperands(O, SlotTracker);
2703 }
2704 #endif
2705