//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "LoopVectorizationPlanner.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
  case VPCanonicalIVPHISC:
  case VPBranchOnMaskSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPFirstOrderRecurrencePHISC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPPredInstPHISC:
  case VPVectorEndPointerSC:
    return false;
  case VPInstructionSC:
    return mayWriteToMemory();
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  // Get the underlying instruction for the recipe, if there is one. It is used
  // to
  //   * decide if cost computation should be skipped for this recipe,
  //   * apply forced target instruction cost.
  Instruction *UI = nullptr;
  if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
    UI = IG->getInsertPos();
  else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
    UI = &WidenMem->getIngredient();

  InstructionCost RecipeCost;
  if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
    RecipeCost = 0;
  } else {
    RecipeCost = computeCost(VF, Ctx);
    if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
        RecipeCost.isValid())
      RecipeCost = InstructionCost(ForceTargetInstructionCost);
  }

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  llvm_unreachable("subclasses should implement computeCost");
}

bool VPRecipeBase::isPhi() const {
  return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
         (isa<VPInstruction>(this) &&
          cast<VPInstruction>(this)->getOpcode() == Instruction::PHI) ||
         isa<VPIRPhi>(this);
}

bool VPRecipeBase::isScalarCast() const {
  auto *VPI = dyn_cast<VPInstruction>(this);
  return VPI && Instruction::isCast(VPI->getOpcode());
}

InstructionCost
VPPartialReductionRecipe::computeCost(ElementCount VF,
                                      VPCostContext &Ctx) const {
  std::optional<unsigned> Opcode;
  VPValue *Op = getOperand(0);
  VPRecipeBase *OpR = Op->getDefiningRecipe();

  // If the partial reduction is predicated, operand 1 will be a select.
  using namespace llvm::VPlanPatternMatch;
  if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
    OpR = Op->getDefiningRecipe();
  }

  Type *InputTypeA = nullptr, *InputTypeB = nullptr;
  TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
                                  ExtBType = TTI::PR_None;

  auto GetExtendKind = [](VPRecipeBase *R) {
    if (!R)
      return TTI::PR_None;
    auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
    if (!WidenCastR)
      return TTI::PR_None;
    if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
      return TTI::PR_ZeroExtend;
    if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
      return TTI::PR_SignExtend;
    return TTI::PR_None;
  };

  // Pick out the opcode and type/extend information from a widen recipe,
  // looking through a (sub 0, x) negation to the negated operand.
  auto HandleWiden = [&](VPWidenRecipe *Widen) {
    if (match(Widen,
              m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Op)))) {
      Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe());
    }
    Opcode = Widen->getOpcode();
    VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
    VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe();
    InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
                                                 : Widen->getOperand(0));
    InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
                                                 : Widen->getOperand(1));
    ExtAType = GetExtendKind(ExtAR);
    ExtBType = GetExtendKind(ExtBR);
  };

  if (isa<VPWidenCastRecipe>(OpR)) {
    InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
    ExtAType = GetExtendKind(OpR);
  } else if (isa<VPReductionPHIRecipe>(OpR)) {
    auto RedPhiOp1R = getOperand(1)->getDefiningRecipe();
    if (isa<VPWidenCastRecipe>(RedPhiOp1R)) {
      InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
      ExtAType = GetExtendKind(RedPhiOp1R);
    } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R))
      HandleWiden(Widen);
  } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
    HandleWiden(Widen);
  } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
    return Reduction->computeCost(VF, Ctx);
  }
  auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
  return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
                                         PhiType, VF, ExtAType, ExtBType,
                                         Opcode, Ctx.CostKind);
}


void VPPartialReductionRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  assert(getOpcode() == Instruction::Add &&
         "Unhandled partial reduction opcode");

  Value *BinOpVal = State.get(getOperand(1));
  Value *PhiVal = State.get(getOperand(0));
  assert(PhiVal && BinOpVal && "Phi and Mul must be set");

  Type *RetTy = PhiVal->getType();

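  // A sketch of the emitted call for a <16 x i32> input accumulated into
  // <4 x i32> partial sums (types and mangling purely illustrative):
  //   %partial.reduce = call <4 x i32>
  //       @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(
  //           <4 x i32> %phi, <16 x i32> %bin)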
  CallInst *V = Builder.CreateIntrinsic(
      RetTy, Intrinsic::experimental_vector_partial_reduce_add,
      {PhiVal, BinOpVal}, nullptr, "partial.reduce");

  State.set(this, V);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "PARTIAL-REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
  printOperands(O, SlotTracker);
}
#endif

FastMathFlags VPIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPSingleDefRecipe::dump() const { VPDef::dump(); }
#endif

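/// Note: unrolling typically appends the unroll part number as a trailing
/// constant operand; when that operand is absent, the accessors below treat
/// the recipe as belonging to part 0.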
template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
  return 0;
}

namespace llvm {
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
} // namespace llvm

VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                             const VPIRFlags &Flags, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert((getNumOperandsForOpcode(Opcode) == -1u ||
          getNumOperandsForOpcode(Opcode) == getNumOperands()) &&
         "number of operands does not match opcode");
}

#ifndef NDEBUG
unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;

  if (Instruction::isBinaryOp(Opcode))
    return 2;

  switch (Opcode) {
  case VPInstruction::StepVector:
    return 0;
  case Instruction::Alloca:
  case Instruction::ExtractValue:
  case Instruction::Freeze:
  case Instruction::Load:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::Not:
    return 1;
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::Store:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::BranchOnCount:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
    return 2;
  case Instruction::Select:
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ReductionStartVector:
    return 3;
  case VPInstruction::ComputeFindIVResult:
    return 4;
  case Instruction::Call:
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
    // Cannot determine the number of operands from the opcode.
    return -1u;
  }
  llvm_unreachable("all cases should be handled above");
}
#endif


bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPLane &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

/// Create a conditional branch using \p Cond branching to the successors of \p
/// VPBB. Note that the first successor is always forward (i.e. not created yet)
/// while the second successor may already have been created (if it is a header
/// block and VPBB is a latch).
static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB,
                                    VPTransformState &State) {
  // Replace the temporary unreachable terminator with a new conditional
  // branch, hooking it up to backward destination (header) for latch blocks
  // now, and to forward destination(s) later when they are created.
  // The second successor is backward iff it is already in VPBB2IRBB.
  VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
  BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
  BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
  BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
  // The first successor is always forward; reset it to nullptr for now.
  CondBr->setSuccessor(0, nullptr);
  IRBB->getTerminator()->eraseFromParent();
  return CondBr;
}


Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      applyFlags(*I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ExtractElement: {
    assert(State.VF.isVector() && "Only extract elements from vectors");
    if (getOperand(1)->isLiveIn()) {
      unsigned IdxToExtract =
          cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
      return State.get(getOperand(0), VPLane(IdxToExtract));
    }
    Value *Vec = State.get(getOperand(0));
    Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
    return Builder.CreateExtractElement(Vec, Idx, Name);
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
    return Builder.CreateFreeze(Op, Name);
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::PHI: {
    llvm_unreachable("should be handled by VPPhi::execute");
  }
  case Instruction::Select: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
    Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
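    // Lane-wise, the mask computes
    //   mask[i] = (VIV[0] + i) < ScalarTC   for i in [0, VF),
    // which is exactly what llvm.get.active.lane.mask returns; the scalar-VF
    // path below emits the single compare directly.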
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPLane(0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSplice(V1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
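    // Computes max(TC - VF * UF, 0) without letting the subtraction wrap.
    // E.g., for VF = 4 and UF = 2 (step 8; the i64 trip count is
    // illustrative) this emits:
    //   %sub = sub i64 %tc, 8
    //   %cmp = icmp ugt i64 %tc, 8
    //   %res = select i1 %cmp, i64 %sub, i64 0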
    unsigned UF = getParent()->getPlan()->getUF();
    Value *ScalarTC = State.get(getOperand(0), VPLane(0));
    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
    // Compute EVL.
    assert(AVL->getType()->isIntegerTy() &&
           "Requested vector length should be an integer.");

    assert(State.VF.isScalable() && "Expected scalable vector factor.");
    Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

    Value *EVL = State.Builder.CreateIntrinsic(
        State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
        {AVL, VFArg, State.Builder.getTrue()});
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    unsigned Part = getUnrollPart(*this);
    auto *IV = State.get(getOperand(0), VPLane(0));
    assert(Part != 0 && "Must have a positive part");
    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPLane(0));
    auto *Br = createCondBranch(Cond, getParent(), State);
    applyMetadata(*Br);
    return Br;
  }
  case VPInstruction::BranchOnCount: {
    // First create the compare.
    Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);
    return createCondBranch(Cond, getParent(), State);
  }
  case VPInstruction::Broadcast: {
    return Builder.CreateVectorSplat(
        State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
  }
  case VPInstruction::BuildStructVector: {
    // For struct types, we need to build a new 'wide' struct type, where each
    // element is widened, i.e., we create a struct of vectors.
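    // E.g., widening a struct {i32, float} at VF = 4 produces a value of type
    // {<4 x i32>, <4 x float>}, where scalar operand L supplies lane L of
    // every field (types illustrative).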
    auto *StructTy =
        cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
    Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
    for (const auto &[LaneIndex, Op] : enumerate(operands())) {
      for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
           FieldIndex++) {
        Value *ScalarValue =
            Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
        Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
        VectorValue =
            Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
        Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
      }
    }
    return Res;
  }
  case VPInstruction::BuildVector: {
    auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
    auto NumOfElements = ElementCount::getFixed(getNumOperands());
    Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
    for (const auto &[Idx, Op] : enumerate(operands()))
      Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
                                              State.Builder.getInt32(Idx));
    return Res;
  }
  case VPInstruction::ReductionStartVector: {
    if (State.VF.isScalar())
      return State.get(getOperand(0), true);
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(getFastMathFlags());
    // If this start vector is scaled then it should produce a vector with
    // fewer elements than the VF.
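    // E.g., an unscaled (scale factor 1) FAdd reduction at VF = 4 with start
    // value %s and identity -0.0 yields <%s, -0.0, -0.0, -0.0>: the identity
    // splat with the start value inserted at lane 0.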
    ElementCount VF = State.VF.divideCoefficientBy(
        cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
    auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
    Constant *Zero = Builder.getInt32(0);
    return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
                                       Zero);
  }
  case VPInstruction::ComputeAnyOfResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    Value *ReducedPartRdx = State.get(getOperand(2));
    for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
      ReducedPartRdx = Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(
              RecurKind::AnyOf),
          State.get(getOperand(Idx)), ReducedPartRdx, "bin.rdx");
    return createAnyOfReduction(Builder, ReducedPartRdx,
                                State.get(getOperand(1), VPLane(0)), OrigPhi);
  }
  case VPInstruction::ComputeFindIVResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    // Get its reduction variable descriptor.
    RecurKind RK = PhiR->getRecurrenceKind();
    assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
           "Unexpected reduction kind");
    assert(!PhiR->isInLoop() &&
           "In-loop FindLastIV reduction is not supported yet");

    // The recipe's operands are the reduction phi, the start value, the
    // sentinel value, followed by one operand for each part of the reduction.
    unsigned UF = getNumOperands() - 3;
    Value *ReducedPartRdx = State.get(getOperand(3));
    RecurKind MinMaxKind;
    bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK);
    if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
      MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
    else
      MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
    for (unsigned Part = 1; Part < UF; ++Part)
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      State.get(getOperand(3 + Part)));

    Value *Start = State.get(getOperand(1), true);
    Value *Sentinel = getOperand(2)->getLiveInIRValue();
    return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start,
                                     Sentinel);
  }
  case VPInstruction::ComputeReductionResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    // Get its reduction variable descriptor.
    RecurKind RK = PhiR->getRecurrenceKind();
    assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
           "should be handled by ComputeFindIVResult");

    // The recipe's operands are the reduction phi, followed by one operand for
    // each part of the reduction.
    unsigned UF = getNumOperands() - 1;
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());

    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    if (hasFastMathFlags())
      Builder.setFastMathFlags(getFastMathFlags());

    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      for (unsigned Part = 1; Part < UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
        else
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
              RdxPart, ReducedPartRdx, "bin.rdx");
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      // TODO: Support in-order reductions based on the recurrence descriptor.
      // All ops in the reduction inherit fast-math-flags from the recurrence
      // descriptor.
      ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement: {
    unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
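    // The element is read from lane VF - Offset; e.g. for VF = 4 the last
    // element lives in lane 3 and the penultimate element in lane 2.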
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
    } else {
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), VPLane(0));
    Value *Addend = State.get(getOperand(1), VPLane(0));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::AnyOf: {
    Value *Res = State.get(getOperand(0));
    for (VPValue *Op : drop_begin(operands()))
      Res = Builder.CreateOr(Res, State.get(Op));
    return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
  }
  case VPInstruction::FirstActiveLane: {
    if (getNumOperands() == 1) {
      Value *Mask = State.get(getOperand(0));
      return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
                                                  true, Name);
    }
    // If there are multiple operands, create a chain of selects to pick the
    // first operand with an active lane and add the number of lanes of the
    // preceding operands.
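    // E.g., for two mask operands m0 and m1 covering lanes [0, VF) and
    // [VF, 2*VF), with tz_i = cttz.elts(m_i), this builds:
    //   %res = select (tz0 != VF), tz0, VF + tz1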
    Value *RuntimeVF =
        getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
    unsigned LastOpIdx = getNumOperands() - 1;
    Value *Res = nullptr;
    for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
      Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
          Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
      Value *Current = Builder.CreateAdd(
          Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
      if (Res) {
        Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
        Res = Builder.CreateSelect(Cmp, Current, Res);
      } else {
        Res = Current;
      }
    }

    return Res;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}


InstructionCost VPInstruction::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  if (Instruction::isBinaryOp(getOpcode())) {
    Type *ResTy = Ctx.Types.inferScalarType(this);
    if (!vputils::onlyFirstLaneUsed(this))
      ResTy = toVectorTy(ResTy, VF);

    if (!getUnderlyingValue()) {
      switch (getOpcode()) {
      case Instruction::FMul:
        return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
      default:
        // TODO: Compute cost for VPInstructions without underlying values once
        // the legacy cost model has been retired.
        return 0;
      }
    }

    assert(!doesGeneratePerAllLanes() &&
           "Should only generate a vector value or single scalar, not scalars "
           "for all lanes.");
    return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
  }

  switch (getOpcode()) {
  case Instruction::ExtractElement: {
    // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                      Ctx.CostKind);
  }
  case VPInstruction::AnyOf: {
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticReductionCost(
        Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
  }
  case VPInstruction::FirstActiveLane: {
    // Calculate the cost of determining the lane index.
    auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
                                  Type::getInt64Ty(Ctx.LLVMCtx),
                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
    SmallVector<int> Mask(VF.getKnownMinValue());
    std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);

    return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
                                  cast<VectorType>(VectorTy),
                                  cast<VectorType>(VectorTy), Mask,
                                  Ctx.CostKind, VF.getKnownMinValue() - 1);
  }
  case VPInstruction::ActiveLaneMask: {
    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
    IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                  {ArgTy, ArgTy});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::ExplicitVectorLength: {
    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                  I32Ty, {Arg0Ty, I32Ty, I1Ty});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::ExtractPenultimateElement:
    if (VF == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();
    LLVM_FALLTHROUGH;
  default:
    // TODO: Compute cost for other VPInstructions once the legacy cost model
    // has been retired.
    assert(!getUnderlyingValue() &&
           "unexpected VPInstruction with underlying value");
    return 0;
  }
}


bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractLastElement ||
         getOpcode() == VPInstruction::ExtractPenultimateElement ||
         getOpcode() == Instruction::ExtractElement ||
         getOpcode() == VPInstruction::FirstActiveLane ||
         getOpcode() == VPInstruction::ComputeAnyOfResult ||
         getOpcode() == VPInstruction::ComputeFindIVResult ||
         getOpcode() == VPInstruction::ComputeReductionResult ||
         getOpcode() == VPInstruction::AnyOf;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == Instruction::PHI || isScalarCast();
}

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPLane(Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert((((GeneratedValue->getType()->isVectorTy() ||
            GeneratedValue->getType()->isStructTy()) ==
           !GeneratesPerFirstLaneOnly) ||
          State.VF.isScalar()) &&
         "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}


bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
    return false;
  default:
    return true;
  }
}


bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ExtractElement:
    return Op == getOperand(1);
  case Instruction::PHI:
    return true;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case Instruction::Freeze:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ReductionStartVector:
    return true;
  case VPInstruction::PtrAdd:
    return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ComputeFindIVResult:
    return Op == getOperand(1);
  }
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  }
  llvm_unreachable("switch should return");
}


#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExtractLastElement:
    O << "extract-last-element";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeAnyOfResult:
    O << "compute-anyof-result";
    break;
  case VPInstruction::ComputeFindIVResult:
    O << "compute-find-iv-result";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif


void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  if (isScalarCast()) {
    Value *Op = State.get(getOperand(0), VPLane(0));
    Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
                                           Op, ResultTy);
    State.set(this, Cast, VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
    Value *StepVector =
        State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
    State.set(this, StepVector);
    break;
  }
  default:
    llvm_unreachable("opcode not implemented yet");
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = ";

  switch (getOpcode()) {
  case VPInstruction::WideIVStep:
    O << "wide-iv-step ";
    printOperands(O, SlotTracker);
    break;
  case VPInstruction::StepVector:
    O << "step-vector " << *ResultTy;
    break;
  default:
    assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
    O << Instruction::getOpcodeName(getOpcode()) << " ";
    printOperands(O, SlotTracker);
    O << " to " << *ResultTy;
  }
}
#endif


void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  PHINode *NewPhi = State.Builder.CreatePHI(
      State.TypeAnalysis.inferScalarType(this), 2, getName());
  unsigned NumIncoming = getNumIncoming();
  if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
    // TODO: Fixup all incoming values of header phis once recipes defining
    // them are introduced.
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
    NewPhi->addIncoming(IncV, PredBB);
  }
  State.set(this, NewPhi, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhi::print(raw_ostream &O, const Twine &Indent,
                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
#endif

VPIRInstruction *VPIRInstruction::create(Instruction &I) {
  if (auto *Phi = dyn_cast<PHINode>(&I))
    return new VPIRPhi(*Phi);
  return new VPIRInstruction(I);
}


void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

InstructionCost VPIRInstruction::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  // The recipe wraps an existing IR instruction on the border of VPlan's
  // scope, hence it does not contribute to the cost-modeling for the VPlan.
  return 0;
}

void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) {
  assert(isa<PHINode>(getInstruction()) &&
         "can only update exiting operands to phi nodes");
  assert(getNumOperands() > 0 && "must have at least one operand");
  VPValue *Exiting = getOperand(0);
  if (Exiting->isLiveIn())
    return;

  Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
  setOperand(0, Exiting);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;
}
#endif


void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  for (const auto &[Idx, Op] : enumerate(operands())) {
    VPValue *ExitValue = Op;
    auto Lane = vputils::isSingleScalar(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, VPLane(Lane));
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(PredBB) == -1)
      Phi->addIncoming(V, PredBB);
    else
      Phi->setIncomingValueForBlock(PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
}

void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
  VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
  assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
         "Number of phi operands must match number of predecessors");
  unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
  R->removeOperand(Position);
}


#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
                                      VPSlotTracker &SlotTracker) const {
  interleaveComma(enumerate(getAsRecipe()->operands()), O,
                  [this, &O, &SlotTracker](auto Op) {
                    O << "[ ";
                    Op.value()->printAsOperand(O, SlotTracker);
                    O << ", ";
                    getIncomingBlock(Op.index())->printAsOperand(O);
                    O << " ]";
                  });
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRPhi::print(raw_ostream &O, const Twine &Indent,
                    VPSlotTracker &SlotTracker) const {
  VPIRInstruction::print(O, Indent, SlotTracker);

  if (getNumOperands() != 0) {
    O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
    interleaveComma(
        enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
          Op.value()->printAsOperand(O, SlotTracker);
          O << " from ";
          getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
        });
    O << ")";
  }
}
#endif


VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer)
    : VPIRMetadata(I) {
  if (!LVer || !isa<LoadInst, StoreInst>(&I))
    return;
  const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I);
  if (AliasScopeMD)
    Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD);
  if (NoAliasMD)
    Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD);
}

void VPIRMetadata::applyMetadata(Instruction &I) const {
  for (const auto &[Kind, Node] : Metadata)
    I.setMetadata(Kind, Node);
}


void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  assert(Variant != nullptr && "Can't create vector function.");

  FunctionType *VFTy = Variant->getFunctionType();
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(args())) {
    Value *Arg;
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    if (!VFTy->getParamType(I.index())->isVectorTy())
      Arg = State.get(I.value(), VPLane(0));
    else
      Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
    Args.push_back(Arg);
  }

  auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

  CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
  applyFlags(*V);
  applyMetadata(*V);
  V->setCallingConv(Variant->getCallingConv());

  if (!V->getType()->isVoidTy())
    State.set(this, V);
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                  Variant->getFunctionType()->params(),
                                  Ctx.CostKind);
}


#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << " @" << CalledFn->getName() << "(";
  interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  O << " (using library function";
  if (Variant->hasName())
    O << ": " << Variant->getName();
  O << ")";
}
#endif

1480
execute(VPTransformState & State)1481 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
1482 assert(State.VF.isVector() && "not widening");
1483
1484 SmallVector<Type *, 2> TysForDecl;
1485 // Add return type if intrinsic is overloaded on it.
1486 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1487 TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1488 SmallVector<Value *, 4> Args;
1489 for (const auto &I : enumerate(operands())) {
1490 // Some intrinsics have a scalar argument - don't replace it with a
1491 // vector.
1492 Value *Arg;
1493 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1494 State.TTI))
1495 Arg = State.get(I.value(), VPLane(0));
1496 else
1497 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1498 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1499 State.TTI))
1500 TysForDecl.push_back(Arg->getType());
1501 Args.push_back(Arg);
1502 }
1503
1504 // Use vector version of the intrinsic.
1505 Module *M = State.Builder.GetInsertBlock()->getModule();
1506 Function *VectorF =
1507 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1508 assert(VectorF &&
1509 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1510
1511 auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
1512 SmallVector<OperandBundleDef, 1> OpBundles;
1513 if (CI)
1514 CI->getOperandBundlesAsDefs(OpBundles);
1515
1516 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1517
1518 applyFlags(*V);
1519 applyMetadata(*V);
1520
1521 if (!V->getType()->isVoidTy())
1522 State.set(this, V);
1523 }

InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
                                                    VPCostContext &Ctx) const {
  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
      InstructionCost::getInvalid(), &Ctx.TLI);
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}

StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
  return Intrinsic::getBaseName(VectorIntrinsicID);
}

bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
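  // Op is used only via its first lane when every argument position at which
  // it appears is one the intrinsic treats as scalar (e.g. the exponent
  // operand of llvm.powi).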
  return all_of(enumerate(operands()), [this, &Op](const auto &X) {
    auto [Idx, V] = X;
    return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
                                                         Idx, nullptr);
  });
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INTRINSIC ";
  if (ResultTy->isVoidTy()) {
    O << "void ";
  } else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << getIntrinsicName() << "(";

  interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";
}
#endif

void VPHistogramRecipe::execute(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  Value *Address = State.get(getOperand(0));
  Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(VPMask);
  else
    Mask =
        Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));

  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in the future, but for now we'll try this.
  if (Opcode == Instruction::Sub)
    IncAmt = Builder.CreateNeg(IncAmt);
  else
    assert(Opcode == Instruction::Add && "only add or sub supported for now");
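  // The emitted call then has the shape (types shown for illustration only):
  //   call void @llvm.experimental.vector.histogram.add(
  //       <vscale x 4 x ptr> %buckets, i64 %inc, <vscale x 4 x i1> %mask)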

  State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
                                {VTy, IncAmt->getType()},
                                {Address, IncAmt, Mask});
}

InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // FIXME: Take the gather and scatter into account as well. For now we're
  //        generating the same cost as the fallback path, but we'll likely
  //        need to create a new TTI method for determining the cost, including
  //        whether we can use base + vec-of-smaller-indices or just
  //        vec-of-pointers.
  assert(VF.isVector() && "Invalid VF for histogram cost");
  Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
  VPValue *IncAmt = getOperand(1);
  Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
  VectorType *VTy = VectorType::get(IncTy, VF);

  // Assume that a non-constant update value (or a constant != 1) requires
  // a multiply, and add that into the cost.
  InstructionCost MulCost =
      Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
  if (IncAmt->isLiveIn()) {
    ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());

    if (CI && CI->getZExtValue() == 1)
      MulCost = TTI::TCC_Free;
  }
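  // An increment of exactly 1 folds into the histogram update itself, so the
  // multiply is only charged for other increment values.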

  // Find the cost of the histogram operation itself.
  Type *PtrTy = VectorType::get(AddressTy, VF);
  Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(Ctx.LLVMCtx),
                              {PtrTy, IncTy, MaskTy});

  // Add the costs together with the add/sub operation.
  return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
         Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-HISTOGRAM buckets: ";
  getOperand(0)->printAsOperand(O, SlotTracker);

  if (Opcode == Instruction::Sub)
    O << ", dec: ";
  else {
    assert(Opcode == Instruction::Add);
    O << ", inc: ";
  }
  getOperand(1)->printAsOperand(O, SlotTracker);

  if (VPValue *Mask = getMask()) {
    O << ", mask: ";
    Mask->printAsOperand(O, SlotTracker);
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  printFlags(O);
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;

  Value *Cond = InvarCond ? InvarCond : State.get(getCond());
  Value *Op0 = State.get(getOperand(1));
  Value *Op1 = State.get(getOperand(2));
  Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
  State.set(this, Sel);
  if (auto *I = dyn_cast<Instruction>(Sel)) {
    if (isa<FPMathOperator>(I))
      applyFlags(*I);
    applyMetadata(*I);
  }
}

InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
  SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
  bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
  Type *ScalarTy = Ctx.Types.inferScalarType(this);
  Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);

  VPValue *Op0, *Op1;
  using namespace llvm::VPlanPatternMatch;
  if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
      (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
       match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
    // select x, y, false --> x & y
    // select x, true, y --> x | y
    const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
    const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);

    SmallVector<const Value *, 2> Operands;
    if (all_of(operands(),
               [](VPValue *Op) { return Op->getUnderlyingValue(); }))
      Operands.append(SI->op_begin(), SI->op_end());
    bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
    return Ctx.TTI.getArithmeticInstrCost(
        IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
        Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
  }

  Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
  if (!ScalarCond)
    CondTy = VectorType::get(CondTy, VF);

  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
    Pred = Cmp->getPredicate();
  return Ctx.TTI.getCmpSelInstrCost(
      Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
      {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
}

VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG)
bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
  switch (OpType) {
  case OperationType::OverflowingBinOp:
    return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
           Opcode == Instruction::Mul ||
           Opcode == VPInstruction::CanonicalIVIncrementForPart;
  case OperationType::Trunc:
    return Opcode == Instruction::Trunc;
  case OperationType::DisjointOp:
    return Opcode == Instruction::Or;
  case OperationType::PossiblyExactOp:
    return Opcode == Instruction::AShr;
  case OperationType::GEPOp:
    return Opcode == Instruction::GetElementPtr ||
           Opcode == VPInstruction::PtrAdd;
  case OperationType::FPMathOp:
    return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
           Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
           Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
           Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
           Opcode == VPInstruction::WideIVStep ||
           Opcode == VPInstruction::ReductionStartVector ||
           Opcode == VPInstruction::ComputeReductionResult;
  case OperationType::NonNegOp:
    return Opcode == Instruction::ZExt;
  case OperationType::Cmp:
    return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
  case OperationType::Other:
    return true;
  }
  llvm_unreachable("Unknown OperationType enum");
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::Trunc:
    if (TruncFlags.HasNUW)
      O << " nuw";
    if (TruncFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.isInBounds())
      O << " inbounds";
    else if (GEPFlags.hasNoUnsignedSignedWrap())
      O << " nusw";
    if (GEPFlags.hasNoUnsignedWrap())
      O << " nuw";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(State.get(VPOp));

    Value *V = Builder.CreateNAryOp(Opcode, Ops);

    if (auto *VecOp = dyn_cast<Instruction>(V)) {
      applyFlags(*VecOp);
      applyMetadata(*VecOp);
    }

    // Use this vector value for all users of the original instruction.
    State.set(this, V);
    break;
  }
  case Instruction::ExtractValue: {
    assert(getNumOperands() == 2 && "expected single level extractvalue");
    Value *Op = State.get(getOperand(0));
    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
    State.set(this, Extract);
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0));
    Value *Freeze = Builder.CreateFreeze(Op);
    State.set(this, Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      C = Builder.CreateFCmpFMF(
          getPredicate(), A, B,
          dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    } else {
      C = Builder.CreateICmp(getPredicate(), A, B);
    }
    if (auto *I = dyn_cast<Instruction>(C))
      applyMetadata(*I);
    State.set(this, C);
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
                                          Ctx.CostKind);
  }
  case Instruction::ExtractValue: {
    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
                                             Ctx.CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode, VectorTy, CmpInst::makeCmpResultType(VectorTy), getPredicate(),
        Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
        {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  Value *A = State.get(Op);
  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
  State.set(this, Cast);
  if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
    applyFlags(*CastOp);
    applyMetadata(*CastOp);
  }
}

InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // TODO: In some cases, VPWidenCastRecipes are created but not considered in
  // the legacy cost model, including truncates/extends when evaluating a
  // reduction in a smaller type.
  if (!getUnderlyingValue())
    return 0;
  // Computes the CastContextHint from a recipe that may access memory.
  auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
    if (VF.isScalar())
      return TTI::CastContextHint::Normal;
    if (isa<VPInterleaveRecipe>(R))
      return TTI::CastContextHint::Interleave;
    if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
      return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                             : TTI::CastContextHint::Normal;
    const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
    if (WidenMemoryRecipe == nullptr)
      return TTI::CastContextHint::None;
    if (!WidenMemoryRecipe->isConsecutive())
      return TTI::CastContextHint::GatherScatter;
    if (WidenMemoryRecipe->isReverse())
      return TTI::CastContextHint::Reversed;
    if (WidenMemoryRecipe->isMasked())
      return TTI::CastContextHint::Masked;
    return TTI::CastContextHint::Normal;
  };

  VPValue *Operand = getOperand(0);
  TTI::CastContextHint CCH = TTI::CastContextHint::None;
  // For Trunc/FPTrunc, get the context from the only user.
  if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
      !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
    if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
      CCH = ComputeCCH(StoreRecipe);
  }
  // For ZExt/SExt/FPExt, get the context from the operand.
  else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
           Opcode == Instruction::FPExt) {
    if (Operand->isLiveIn())
      CCH = TTI::CastContextHint::Normal;
    else if (Operand->getDefiningRecipe())
      CCH = ComputeCCH(Operand->getDefiningRecipe());
  }

  auto *SrcTy =
      cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
  auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
  // Arm TTI will use the underlying instruction to determine the cost.
  return Ctx.TTI.getCastInstrCost(
      Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
      dyn_cast_if_present<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-INDUCTION ";
  printOperands(O, SlotTracker);

  if (auto *TI = getTruncInst())
    O << " (truncated to " << *TI->getType() << ")";
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as a live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps. BaseIV is the scalar induction variable on
  // which to base the steps; Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPLane(0));
  Value *Step = State.get(getStepValue(), VPLane(0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Lane) {
    StartLane = State.Lane->getKnownLane();
    EndLane = StartLane + 1;
  }
  Value *StartIdx0;
  if (getUnrollPart(*this) == 0)
    StartIdx0 = ConstantInt::get(IntStepTy, 0);
  else {
    StartIdx0 = State.get(getOperand(2), true);
    if (getUnrollPart(*this) != 1) {
      StartIdx0 =
          Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
                                                        getUnrollPart(*this)));
    }
    StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
  }
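  // At this point, for unroll part P, StartIdx0 equals P * (operand 2), or 0
  // for part 0; lane L below therefore receives
  // BaseIV + (StartIdx0 + L) * Step.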

  if (!FirstLaneOnly && State.VF.isScalable()) {
    auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
    auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
    if (BaseIVTy->isFloatingPointTy())
      InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
    auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
    auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
    State.set(this, Add);
    // It's useful to record the lane values too for the known minimum number
    // of elements so we do those below. This improves the code quality when
    // trying to extract the first element, for example.
  }

  if (BaseIVTy->isFloatingPointTy())
    StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
    Value *StartIdx = Builder.CreateBinOp(
        AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
    // The step returned by `createStepForVF` is a runtime-evaluated value
    // when VF is scalable. Otherwise, it should be folded into a Constant.
    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
           "Expected StartIdx to be folded to a constant when VF is not "
           "scalable");
    auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
    auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
    State.set(this, Add, VPLane(Lane));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPLane(0)));

    auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                           ArrayRef(Ops).drop_front(), "",
                                           getGEPNoWrapFlags());
    Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
    State.set(this, Splat);
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers unless VF is scalar.
    // The pointer operand of the new GEP. If it's loop-invariant, we
    // won't broadcast it.
    auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
                                         : State.get(getOperand(0));

    // Collect all the indices for the new GEP. If any index is
    // loop-invariant, we won't broadcast it.
    SmallVector<Value *, 4> Indices;
    for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
      VPValue *Operand = getOperand(I);
      if (isIndexLoopInvariant(I - 1))
        Indices.push_back(State.get(Operand, VPLane(0)));
      else
        Indices.push_back(State.get(Operand));
    }

    // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
    // but it should be a vector, otherwise.
    auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                           Indices, "", getGEPNoWrapFlags());
    assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
           "NewGEP is not a pointer vector");
    State.set(this, NewGEP);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
                           unsigned CurrentPart, IRBuilderBase &Builder) {
  // Use i32 for the gep index type when the value is constant,
  // or query DataLayout for a more suitable index type otherwise.
  const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
  return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
             ? DL.getIndexType(Builder.getPtrTy(0))
             : Builder.getInt32Ty();
}

void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  unsigned CurrentPart = getUnrollPart(*this);
  bool IsUnitStride = Stride == 1 || Stride == -1;
  Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
                                IsUnitStride, CurrentPart, Builder);

  // The wide store needs to start at the last vector element.
  Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
  if (IndexTy != RunTimeVF->getType())
    RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
  // NumElt = Stride * CurrentPart * RunTimeVF
  Value *NumElt = Builder.CreateMul(
      ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
  // LastLane = Stride * (RunTimeVF - 1)
  Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
  if (Stride != 1)
    LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
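  // E.g. for a reverse access (Stride == -1) with RunTimeVF = 4 and
  // CurrentPart = 1: NumElt = -4 and LastLane = -3, so the result points at
  // the lowest-addressed element of the second reversed vector.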
  Value *Ptr = State.get(getOperand(0), VPLane(0));
  Value *ResultPtr =
      Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
  ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
                                getGEPNoWrapFlags());

  State.set(this, ResultPtr, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-end-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  unsigned CurrentPart = getUnrollPart(*this);
  Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
                                /*IsUnitStride*/ true, CurrentPart, Builder);
  Value *Ptr = State.get(getOperand(0), VPLane(0));

  Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
  Value *ResultPtr =
      Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());

  State.set(this, ResultPtr, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";

  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  assert(isNormalized() && "Expected blend to be normalized!");
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // (and which are essentially undef) are taken from In0.
  bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
  Value *Result = nullptr;
  for (unsigned In = 0; In < NumIncoming; ++In) {
    // We might have single edge PHIs (blocks) - use an identity
    // 'select' for the first PHI operand.
    Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
    if (In == 0)
      Result = In0; // Initialize with the first incoming value.
    else {
      // Select between the current value and the previous incoming edge
      // based on the incoming mask.
      Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
      Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
    }
  }
  State.set(this, Result, OnlyFirstLaneUsed);
}

InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  // Handle cases where only the first lane is used the same way as the legacy
  // cost model.
  if (vputils::onlyFirstLaneUsed(this))
    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

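  // A normalized blend of N incoming values lowers to N - 1 selects.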
  Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
  Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
  return (getNumIncomingValues() - 1) *
         Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
                                    CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0)
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
  RecurKind Kind = getRecurrenceKind();
  assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
         "In-loop AnyOf reductions aren't currently supported");
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());
  Value *NewVecOp = State.get(getVecOp());
  if (VPValue *Cond = getCondOp()) {
    Value *NewCond = State.get(Cond, State.VF.isScalar());
    VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
    Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();

    Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
    if (State.VF.isVector())
      Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);

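    // Masked-off lanes keep the neutral element (e.g. 0 for add, INT_MIN for
    // smax), so they do not affect the reduced result.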
    Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
    NewVecOp = Select;
  }
  Value *NewRed;
  Value *NextInChain;
  if (IsOrdered) {
    if (State.VF.isVector())
      NewRed =
          createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
    else
      NewRed = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
          PrevInChain, NewVecOp);
    PrevInChain = NewRed;
    NextInChain = NewRed;
  } else {
    PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
    NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
          PrevInChain);
  }
  State.set(this, NextInChain, /*IsScalar*/ true);
}

void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  Builder.setFastMathFlags(getFastMathFlags());

  RecurKind Kind = getRecurrenceKind();
  Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
  Value *VecOp = State.get(getVecOp());
  Value *EVL = State.get(getEVL(), VPLane(0));

  Value *Mask;
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(CondOp);
  else
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
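  // With an all-true mask, the explicit vector length (EVL) alone bounds how
  // many lanes participate in the VP reduction.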

  Value *NewRed;
  if (isOrdered()) {
    NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
  } else {
    NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
    else
      NewRed = Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
          Prev);
  }
  State.set(this, NewRed, /*IsScalar*/ true);
}

InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  RecurKind RdxKind = getRecurrenceKind();
  Type *ElementTy = Ctx.Types.inferScalarType(this);
  auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
  FastMathFlags FMFs = getFastMathFlags();
  std::optional<FastMathFlags> OptionalFMF =
      ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;

  // TODO: Support any-of reductions.
  assert(
      (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
       ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "Any-of reduction not implemented in VPlan-based cost model currently.");

  // Note that TTI should model the cost of moving the result to the scalar
  // register and the BinOp cost in getMinMaxReductionCost().
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
    return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
  }

  // Note that TTI should model the cost of moving the result to the scalar
  // register and the BinOp cost in getArithmeticReductionCost().
  return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
                                            Ctx.CostKind);
}

VPExpressionRecipe::VPExpressionRecipe(
    ExpressionTypes ExpressionType,
    ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
    : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
      ExpressionRecipes(SetVector<VPSingleDefRecipe *>(
                            ExpressionRecipes.begin(), ExpressionRecipes.end())
                            .takeVector()),
      ExpressionType(ExpressionType) {
  assert(!ExpressionRecipes.empty() && "Nothing to combine?");
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");

  // Maintain a copy of the expression recipes as a set of users.
  SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
  for (auto *R : ExpressionRecipes)
    ExpressionRecipesAsSetOfUsers.insert(R);

  // Recipes in the expression, except the last one, must only be used by
  // (other) recipes inside the expression. If there are other users, external
  // to the expression, use a clone of the recipe for external users.
  for (VPSingleDefRecipe *R : ExpressionRecipes) {
    if (R != ExpressionRecipes.back() &&
        any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
          return !ExpressionRecipesAsSetOfUsers.contains(U);
        })) {
      // There are users outside of the expression. Clone the recipe and use
      // the clone for those external users.
      VPSingleDefRecipe *CopyForExtUsers = R->clone();
      R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
                                                VPUser &U, unsigned) {
        return !ExpressionRecipesAsSetOfUsers.contains(&U);
      });
      CopyForExtUsers->insertBefore(R);
    }
    if (R->getParent())
      R->removeFromParent();
  }

  // Internalize all external operands to the expression recipes. To do so,
  // create new temporary VPValues for all operands defined by a recipe outside
  // the expression. The original operands are added as operands of the
  // VPExpressionRecipe itself.
  for (auto *R : ExpressionRecipes) {
    for (const auto &[Idx, Op] : enumerate(R->operands())) {
      auto *Def = Op->getDefiningRecipe();
      if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
        continue;
      addOperand(Op);
      LiveInPlaceholders.push_back(new VPValue());
      R->setOperand(Idx, LiveInPlaceholders.back());
    }
  }
}

void VPExpressionRecipe::decompose() {
  for (auto *R : ExpressionRecipes)
    R->insertBefore(this);

  for (const auto &[Idx, Op] : enumerate(operands()))
    LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);

  replaceAllUsesWith(ExpressionRecipes.back());
  ExpressionRecipes.clear();
}

InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Type *RedTy = Ctx.Types.inferScalarType(this);
  auto *SrcVecTy = cast<VectorType>(
      toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
  assert(RedTy->isIntegerTy() &&
         "VPExpressionRecipe only supports integer types currently.");
  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
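    // For example, "reduce.add (zext <16 x i8> %x to <16 x i32>)" can be
    // costed as a single extended reduction instead of an extend followed by
    // a separate reduction.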
    unsigned Opcode = RecurrenceDescriptor::getOpcode(
        cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
    return Ctx.TTI.getExtendedReductionCost(
        Opcode,
        cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
            Instruction::ZExt,
        RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
  }
  case ExpressionTypes::MulAccReduction:
    return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);

  case ExpressionTypes::ExtMulAccReduction:
    return Ctx.TTI.getMulAccReductionCost(
        cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
            Instruction::ZExt,
        RedTy, SrcVecTy, Ctx.CostKind);
  }
  llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
}

bool VPExpressionRecipe::mayReadOrWriteMemory() const {
  return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
    return R->mayReadFromMemory() || R->mayWriteToMemory();
  });
}

bool VPExpressionRecipe::mayHaveSideEffects() const {
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");
  return false;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPRESSION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());

  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    getOperand(1)->printAsOperand(O, SlotTracker);
    O << " +";
    O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
    getOperand(0)->printAsOperand(O, SlotTracker);
    Red->printFlags(O);

    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType();
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  case ExpressionTypes::MulAccReduction:
  case ExpressionTypes::ExtMulAccReduction: {
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + ";
    O << "reduce."
      << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (";
    O << "mul";
    bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
    auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
                                               : ExpressionRecipes[0]);
    Mul->printFlags(O);
    if (IsExtended)
      O << "(";
    getOperand(0)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
      O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
        << *Ext0->getResultType() << "), (";
    } else {
      O << ", ";
    }
    getOperand(1)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
      O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
        << *Ext1->getResultType() << ")";
    }
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  }
}

void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}

void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " vp.reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}

#endif
2818
2819 /// A helper function to scalarize a single Instruction in the innermost loop.
2820 /// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
2821 /// operands from \p RepRecipe instead of \p Instr's operands.
scalarizeInstruction(const Instruction * Instr,VPReplicateRecipe * RepRecipe,const VPLane & Lane,VPTransformState & State)2822 static void scalarizeInstruction(const Instruction *Instr,
2823 VPReplicateRecipe *RepRecipe,
2824 const VPLane &Lane, VPTransformState &State) {
2825 assert((!Instr->getType()->isAggregateType() ||
2826 canVectorizeTy(Instr->getType())) &&
2827 "Expected vectorizable or non-aggregate type.");
2828
2829 // Does this instruction return a value ?
2830 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2831
2832 Instruction *Cloned = Instr->clone();
2833 if (!IsVoidRetTy) {
2834 Cloned->setName(Instr->getName() + ".cloned");
2835 #if !defined(NDEBUG)
2836 // Verify that VPlan type inference results agree with the type of the
2837 // generated values.
2838 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2839 "inferred type and type from generated instructions do not match");
2840 #endif
2841 }
2842
2843 RepRecipe->applyFlags(*Cloned);
2844 RepRecipe->applyMetadata(*Cloned);
2845
2846 if (auto DL = RepRecipe->getDebugLoc())
2847 State.setDebugLocFrom(DL);
2848
2849 // Replace the operands of the cloned instructions with their scalar
2850 // equivalents in the new loop.
2851 for (const auto &I : enumerate(RepRecipe->operands())) {
2852 auto InputLane = Lane;
2853 VPValue *Operand = I.value();
2854 if (vputils::isSingleScalar(Operand))
2855 InputLane = VPLane::getFirstLane();
2856 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2857 }
2858
2859 // Place the cloned scalar in the new loop.
2860 State.Builder.Insert(Cloned);
2861
2862 State.set(RepRecipe, Cloned, Lane);
2863
2864 // If we just cloned a new assumption, add it the assumption cache.
2865 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2866 State.AC->registerAssumption(II);
2867
2868 assert(
2869 (RepRecipe->getParent()->getParent() ||
2870 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2871 all_of(RepRecipe->operands(),
2872 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2873 "Expected a recipe is either within a region or all of its operands "
2874 "are defined outside the vectorized region.");
2875 }
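
// For illustration only (IR names made up): replicating "%d = sdiv i32 %a, %b"
// for lane 2 clones the sdiv as "%d.cloned", rewires its operands to the
// lane-2 scalar values of the recipe's operands (or to lane 0 for operands
// that are single scalars), and records the clone as the recipe's lane-2
// value.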

void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();

  if (!State.Lane) {
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled");
    scalarizeInstruction(UI, this, VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated");
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
  scalarizeInstruction(UI, this, *State.Lane, State);
  // Insert the scalar instance, packing it into a vector.
  if (State.VF.isVector() && shouldPack()) {
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(VectorType::get(UI->getType(), State.VF))
            : State.get(this);
    State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
                                                        *State.Lane));
  }
}

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}
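
// For illustration only: packing is needed for chains of the form
//   REPLICATE vp<%r> = ...                   (this recipe, predicated)
//   PHI-PREDICATED-INSTRUCTION vp<%p> = vp<%r>
//   WIDEN ... = op ... vp<%p> ...            (user consuming %p as a vector)
// where the widened user reads vp<%p> as a vector, so the per-lane scalars of
// vp<%r> must be inserted into a vector value as they are generated.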

InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  Instruction *UI = cast<Instruction>(getUnderlyingValue());
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing its cost multiple times for now.
  Ctx.SkipCostComputation.insert(UI);

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  Type *ResultTy = Ctx.Types.inferScalarType(this);
  switch (UI->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto Op2Info = Ctx.getOperandInfo(getOperand(1));
    SmallVector<const Value *, 4> Operands(UI->operand_values());
    return Ctx.TTI.getArithmeticInstrCost(
               UI->getOpcode(), ResultTy, CostKind,
               {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
               Op2Info, Operands, UI, &Ctx.TLI) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  }
  }

  return Ctx.getLegacyCost(UI, VF);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Branch on Mask works only on single instance.");

  VPValue *BlockInMask = getOperand(0);
  Value *ConditionBit = State.get(BlockInMask, *State.Lane);

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr =
      State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
  CondBr->setSuccessor(0, nullptr);
  CurrentTerminator->eraseFromParent();
}

InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  // The legacy cost model doesn't assign costs to branches for individual
  // replicate regions. Match the current behavior in the VPlan cost model for
  // now.
  return 0;
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Lane));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  if (State.hasVectorValue(getOperand(0))) {
    Value *VectorValue = State.get(getOperand(0));
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this))
      State.reset(this, VPhi);
    else
      State.set(this, VPhi);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi);
  } else {
    if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
      return;

    Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Lane))
      State.reset(this, Phi, *State.Lane);
    else
      State.set(this, Phi, *State.Lane);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Lane);
  }
}
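
// For illustration only (IR names made up): in the vector-value case this
// emits a phi merging the not-yet-modified vector with the vector that has
// the current lane's element inserted, e.g.
//   %merge = phi <4 x i32> [ %prev.vec, %predicating.bb ],
//                          [ %vec.with.lane, %predicated.bb ]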

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
#endif

InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                    ->getAddressSpace();
  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
                        ? Instruction::Load
                        : Instruction::Store;

  if (!Consecutive) {
    // TODO: Using the original IR may not be accurate.
    // Currently, ARM will use the underlying IR to calculate gather/scatter
    // instruction cost.
    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
    assert(!Reverse &&
           "Inconsecutive memory accesses should not be marked as reverse.");
    return Ctx.TTI.getAddressComputationCost(Ty) +
           Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
                                          Ctx.CostKind, &Ingredient);
  }

  InstructionCost Cost = 0;
  if (IsMasked) {
    Cost +=
        Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
  } else {
    TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
        isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
                                                           : getOperand(1));
    Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
                                    OpInfo, &Ingredient);
  }
  if (!Reverse)
    return Cost;

  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                       cast<VectorType>(Ty),
                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
                                       0);
}

void VPWidenLoadRecipe::execute(VPTransformState &State) {
  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed when an actual mask is present; a missing
    // (null) mask is implicitly all-true, and the reverse of an all-true mask
    // is still all-true.
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(Mask, "reverse");
  }

  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
  Value *NewLI;
  if (CreateGather) {
    NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
                                       "wide.masked.gather");
  } else if (Mask) {
    NewLI =
        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
                                 PoisonValue::get(DataTy), "wide.masked.load");
  } else {
    NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
  }
  applyMetadata(*cast<Instruction>(NewLI));
  if (Reverse)
    NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
  State.set(this, NewLI);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
#endif

/// Use an all-true mask for the reverse rather than the actual mask, as doing
/// so avoids a dependence without affecting the result.
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
                                     Value *EVL, const Twine &Name) {
  VectorType *ValTy = cast<VectorType>(Operand->getType());
  Value *AllTrueMask =
      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
}
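
// For illustration only (types and names made up): for a scalable i32 vector
// this emits a call along the lines of
//   %vp.reverse = call <vscale x 4 x i32> @llvm.experimental.vp.reverse...(
//       <vscale x 4 x i32> %operand, <vscale x 4 x i1> splat (i1 true),
//       i32 %evl)
// reversing only the first %evl lanes.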

void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  CallInst *NewLI;
  Value *EVL = State.get(getEVL(), VPLane(0));
  Value *Addr = State.get(getAddr(), !CreateGather);
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  }

  if (CreateGather) {
    NewLI =
        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
                                {Addr, Mask, EVL}, nullptr,
                                "wide.masked.gather");
  } else {
    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
                                    {Addr, Mask, EVL}, nullptr, "vp.op.load");
  }
  NewLI->addParamAttr(
      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
  applyMetadata(*NewLI);
  Instruction *Res = NewLI;
  if (isReverse())
    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
  State.set(this, Res);
}

InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use the EVL to replace the tail mask, while the
  // legacy model always calculates the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare to the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  unsigned AS = getLoadStoreAddressSpace(&Ingredient);
  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
      Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
  if (!Reverse)
    return Cost;

  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                       cast<VectorType>(Ty),
                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
                                       0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenStoreRecipe::execute(VPTransformState &State) {
  VPValue *StoredVPValue = getStoredValue();
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(&Ingredient);

  auto &Builder = State.Builder;

  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed when an actual mask is present; a missing
    // (null) mask is implicitly all-true, and the reverse of an all-true mask
    // is still all-true.
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(Mask, "reverse");
  }

  Value *StoredVal = State.get(StoredVPValue);
  if (isReverse()) {
    // If we store to reverse consecutive memory locations, then we need
    // to reverse the order of elements in the stored value.
    StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
    // We don't want to update the value in the map, as it might be used in
    // another expression. So don't call resetVectorValue(StoredVal).
  }
  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
  Instruction *NewSI = nullptr;
  if (CreateScatter)
    NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
  else if (Mask)
    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
  else
    NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
  applyMetadata(*NewSI);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = getStoredValue();
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(&Ingredient);

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(StoredValue);
  Value *EVL = State.get(getEVL(), VPLane(0));
  if (isReverse())
    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  }
  Value *Addr = State.get(getAddr(), !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                    Intrinsic::vp_scatter,
                                    {StoredVal, Addr, Mask, EVL});
  } else {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                    Intrinsic::vp_store,
                                    {StoredVal, Addr, Mask, EVL});
  }
  NewSI->addParamAttr(
      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
  applyMetadata(*NewSI);
}

InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use the EVL to replace the tail mask, while the
  // legacy model always calculates the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare to the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  unsigned AS = getLoadStoreAddressSpace(&Ingredient);
  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
      Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
  if (!Reverse)
    return Cost;

  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                       cast<VectorType>(Ty),
                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
                                       0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector with the same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast using an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
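
// For illustration only (names made up): on a target with 64-bit pointers,
// casting <4 x double> to <4 x ptr> takes the two-step path and emits roughly
//   %cast.int = bitcast <4 x double> %v to <4 x i64>
//   %cast.ptr = inttoptr <4 x i64> %cast.int to <4 x ptr>
// while same-kind element types use a single bit-or-pointer cast.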

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
    VectorType *InterleaveTy =
        VectorType::get(VecTy->getElementType(),
                        VecTy->getElementCount().multiplyCoefficientBy(Factor));
    return Builder.CreateIntrinsic(InterleaveTy,
                                   getInterleaveIntrinsicID(Factor), Vals,
                                   /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}
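
// For illustration only: for two fixed <4 x i32> inputs (Factor = 2), the
// inputs are first concatenated into an <8 x i32> and then shuffled with the
// interleave mask <0, 4, 1, 5, 2, 6, 3, 7>, yielding
//   <a0, b0, a1, b1, a2, b2, a3, b3>.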

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
// for (i = 0; i < N; i+=3) {
//   R = Pic[i]; // Member of index 0
//   G = Pic[i+1]; // Member of index 1
//   B = Pic[i+2]; // Member of index 2
//   ... // do something to R, G, B
// }
// To:
// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
// for (i = 0; i < N; i+=3) {
//   ... do something to R, G, B
//   Pic[i] = R; // Member of index 0
//   Pic[i+1] = G; // Member of index 1
//   Pic[i+2] = B; // Member of index 2
// }
// To:
// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  const InterleaveGroup<Instruction> *Group = IG;
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);

  VPValue *BlockInMask = getMask();
  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Addr, VPLane(0));
  Value *PoisonVec = PoisonValue::get(VecTy);

  auto CreateGroupMask = [&BlockInMask, &State,
                          &InterleaveFactor](Value *MaskForGaps) -> Value * {
    if (State.VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *ResBlockInMask = State.get(BlockInMask);
      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
      return interleaveVectors(State.Builder, Ops, "interleaved.mask");
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *ResBlockInMask = State.get(BlockInMask);
    Value *ShuffledMask = State.Builder.CreateShuffleVector(
        ResBlockInMask,
        createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
        "interleaved.mask");
    return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                   ShuffledMask, MaskForGaps)
                       : ShuffledMask;
  };

  const DataLayout &DL = Instr->getDataLayout();
  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    Value *MaskForGaps = nullptr;
    if (NeedsMaskForGaps) {
      MaskForGaps =
          createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null");
    }

    Instruction *NewLoad;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(MaskForGaps);
      NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
                                               Group->getAlign(), GroupMask,
                                               PoisonVec, "wide.masked.vec");
    } else
      NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
                                                Group->getAlign(), "wide.vec");
    Group->addMetadata(NewLoad);

    ArrayRef<VPValue *> VPDefs = definedValues();
    const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
    if (VecTy->isScalableTy()) {
      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
      // so must use intrinsics to deinterleave.
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      Value *Deinterleave = State.Builder.CreateIntrinsic(
          getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
          NewLoad,
          /*FMFSource=*/nullptr, "strided.vec");

      for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
        Instruction *Member = Group->getMember(I);
        Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I);
        if (!Member) {
          // This value is not needed, as it's not used.
          cast<Instruction>(StridedVec)->eraseFromParent();
          continue;
        }
        // If this member has a different type, cast the result to it.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
          StridedVec =
              createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");

        State.set(VPDefs[J], StridedVec);
        ++J;
      }

      return;
    }
    assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
      Value *StridedVec =
          State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");

      // If this member has a different type, cast the result to it.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
        StridedVec =
            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");

      State.set(VPDefs[J], StridedVec);
      ++J;
    }
    return;
  }

  // The sub-vector type for the current instruction.
  auto *SubVT = VectorType::get(ScalarTy, State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || !State.VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Failed to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(i);

    // Skip the gaps in the group.
    if (!Member) {
      Value *Undef = PoisonValue::get(SubVT);
      StoredVecs.push_back(Undef);
      continue;
    }

    Value *StoredVec = State.get(StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");

    // If this member has a different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

    StoredVecs.push_back(StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        IVec, ResAddr, Group->getAlign(), GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());

  Group->addMetadata(NewStoreInstr);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << " store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << " ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Instruction *InsertPos = getInsertPos();
  // Find the VPValue index of the interleave group. We need to skip gaps.
  unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
    if (auto *Member = IG->getMember(Idx)) {
      if (Member == InsertPos)
        break;
      InsertPosIdx++;
    }
  Type *ValTy = Ctx.Types.inferScalarType(
      getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
                                : getStoredValues()[InsertPosIdx]);
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(InsertPos);

  unsigned InterleaveFactor = IG->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (IG->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
      InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
      IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);

  if (!IG->isReverse())
    return Cost;

  return Cost + IG->getNumMembers() *
                    Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                           VectorTy, VectorTy, {}, Ctx.CostKind,
                                           0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
  assert(getInductionDescriptor().getKind() ==
             InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!");
  assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() &&
         "Unexpected type.");
  assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
         "Recipe should have been replaced");

  unsigned CurrentPart = getUnrollPart(*this);

  // Build a pointer phi.
  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
  Type *ScStValueType = ScalarStartValue->getType();

  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  PHINode *NewPointerPhi = nullptr;
  if (CurrentPart == 0) {
    IRBuilder<>::InsertPointGuard Guard(State.Builder);
    if (State.Builder.GetInsertPoint() !=
        State.Builder.GetInsertBlock()->getFirstNonPHIIt())
      State.Builder.SetInsertPoint(
          State.Builder.GetInsertBlock()->getFirstNonPHIIt());
    NewPointerPhi = State.Builder.CreatePHI(ScStValueType, 2, "pointer.phi");
    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
    NewPointerPhi->setDebugLoc(getDebugLoc());
  } else {
    // The recipe has been unrolled. In that case, fetch the single pointer phi
    // shared among all unrolled parts of the recipe.
    auto *GEP =
        cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
  }

  // A pointer induction, performed by using a gep.
  BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
  Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
  Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  if (CurrentPart == 0) {
    // The recipe represents the first part of the pointer induction. Create the
    // GEP to increment the phi across all unrolled parts.
    Value *NumUnrolledElems = State.get(getOperand(2), true);

    Value *InductionGEP = GetElementPtrInst::Create(
        State.Builder.getInt8Ty(), NewPointerPhi,
        State.Builder.CreateMul(
            ScalarStepValue,
            State.Builder.CreateTrunc(NumUnrolledElems, PhiType)),
        "ptr.ind", InductionLoc);

    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
  }

  // Create actual address geps that use the pointer phi as base and a
  // vectorized version of the step value (<step*0, ..., step*N>) as offset.
  Type *VecPhiType = VectorType::get(PhiType, State.VF);
  Value *StartOffsetScalar = State.Builder.CreateMul(
      RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
  Value *StartOffset =
      State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
  // Create a vector of consecutive numbers from zero to VF.
  StartOffset = State.Builder.CreateAdd(
      StartOffset, State.Builder.CreateStepVector(VecPhiType));

  assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
         "scalar step must be the same across all parts");
  Value *GEP = State.Builder.CreateGEP(
      State.Builder.getInt8Ty(), NewPointerPhi,
      State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
                                               State.VF, ScalarStepValue)),
      "vector.gep");
  State.set(this, GEP);
}
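
// For illustration only (part 0, fixed VF = 4, names made up): the code above
// emits roughly
//   %pointer.phi = phi ptr [ %start, %vector.ph ], [ %ptr.ind, ... ]
//   %vector.gep  = getelementptr i8, ptr %pointer.phi,
//                    <4 x i64> (<0, 1, 2, 3> * splat(%step))
//   %ptr.ind     = getelementptr i8, ptr %pointer.phi,
//                    i64 (%step * %num.unrolled.elems)
// where %ptr.ind's incoming block is temporarily wrong and is fixed up after
// VPlan execution.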

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  assert((getNumOperands() == 3 || getNumOperands() == 5) &&
         "unexpected number of operands");
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  if (getNumOperands() == 5) {
    O << ", ";
    getOperand(3)->printAsOperand(O, SlotTracker);
    O << ", ";
    getOperand(4)->printAsOperand(O, SlotTracker);
  }
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "cannot be used in per-lane");
  const DataLayout &DL = SE.getDataLayout();
  SCEVExpander Exp(SE, DL, "induction", /*PreserveLCSSA=*/true);
  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  State.set(this, Res, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
  if (VF.isVector()) {
    VStep = Builder.CreateVectorSplat(VF, VStep);
    VStep =
        Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
  }
  Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
  State.set(this, CanonicalVectorIV);
}
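
// For illustration only: with a fixed VF = 4, an i64 IV and unroll part 0,
// this produces roughly (names made up)
//   %broadcast = splat of the scalar canonical IV
//   %vec.iv    = add <4 x i64> %broadcast, <i64 0, i64 1, i64 2, i64 3>
// and for part P the step vector is offset by P * VF.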

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
  Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(VectorInit, VectorPH);
  State.set(this, Phi);
}
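
// For illustration only: with a fixed VF = 4 the start value is placed in the
// last lane of the initial vector, e.g.
//   %vector.recur.init = insertelement <4 x i32> poison, i32 %init, i32 3
// so only the last lane carries the scalar initial value of the recurrence.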

InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  if (VF.isScalar())
    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

  return 0;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
  VPValue *StartVPV = getStartValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Value *StartV = State.get(StartVPV, ScalarPHI);
  Type *VecTy = StartV->getType();

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
  Phi->insertBefore(HeaderBB->getFirstInsertionPt());
  State.set(this, Phi, IsInLoop);

  Phi->addIncoming(StartV, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
  if (VFScaleFactor != 1)
    O << " (VF scaled by 1/" << VFScaleFactor << ")";
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  Value *Op0 = State.get(getOperand(0));
  Type *VecTy = Op0->getType();
  Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
  // Manually move it with the other PHIs in case PHI recipes above this one
  // also inserted non-phi instructions.
  // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in
  // convertToConcreteRecipes.
  VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt());
  State.set(this, VecPhi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  Value *StartMask = State.get(getOperand(0));
  PHINode *Phi =
      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
  Phi->addIncoming(StartMask, VectorPH);
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif