1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains implementations for different VPlan recipes. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "LoopVectorizationPlanner.h" 15 #include "VPlan.h" 16 #include "VPlanAnalysis.h" 17 #include "VPlanHelpers.h" 18 #include "VPlanPatternMatch.h" 19 #include "VPlanUtils.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Analysis/AssumptionCache.h" 24 #include "llvm/Analysis/IVDescriptors.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/IRBuilder.h" 28 #include "llvm/IR/Instruction.h" 29 #include "llvm/IR/Instructions.h" 30 #include "llvm/IR/Intrinsics.h" 31 #include "llvm/IR/Type.h" 32 #include "llvm/IR/Value.h" 33 #include "llvm/Support/Casting.h" 34 #include "llvm/Support/CommandLine.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/raw_ostream.h" 37 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 38 #include "llvm/Transforms/Utils/LoopUtils.h" 39 #include "llvm/Transforms/Utils/LoopVersioning.h" 40 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 41 #include <cassert> 42 43 using namespace llvm; 44 45 using VectorParts = SmallVector<Value *, 2>; 46 47 #define LV_NAME "loop-vectorize" 48 #define DEBUG_TYPE LV_NAME 49 50 bool VPRecipeBase::mayWriteToMemory() const { 51 switch (getVPDefID()) { 52 case VPExpressionSC: 53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); 54 case VPInstructionSC: 55 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); 56 case VPInterleaveSC: 57 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; 58 case VPWidenStoreEVLSC: 59 case VPWidenStoreSC: 60 return true; 61 case VPReplicateSC: 62 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 63 ->mayWriteToMemory(); 64 case VPWidenCallSC: 65 return !cast<VPWidenCallRecipe>(this) 66 ->getCalledScalarFunction() 67 ->onlyReadsMemory(); 68 case VPWidenIntrinsicSC: 69 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory(); 70 case VPCanonicalIVPHISC: 71 case VPBranchOnMaskSC: 72 case VPFirstOrderRecurrencePHISC: 73 case VPReductionPHISC: 74 case VPScalarIVStepsSC: 75 case VPPredInstPHISC: 76 return false; 77 case VPBlendSC: 78 case VPReductionEVLSC: 79 case VPReductionSC: 80 case VPVectorPointerSC: 81 case VPWidenCanonicalIVSC: 82 case VPWidenCastSC: 83 case VPWidenGEPSC: 84 case VPWidenIntOrFpInductionSC: 85 case VPWidenLoadEVLSC: 86 case VPWidenLoadSC: 87 case VPWidenPHISC: 88 case VPWidenSC: 89 case VPWidenSelectSC: { 90 const Instruction *I = 91 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 92 (void)I; 93 assert((!I || !I->mayWriteToMemory()) && 94 "underlying instruction may write to memory"); 95 return false; 96 } 97 default: 98 return true; 99 } 100 } 101 102 bool VPRecipeBase::mayReadFromMemory() const { 103 switch (getVPDefID()) { 104 case VPExpressionSC: 105 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); 106 case VPInstructionSC: 107 return 
cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); 108 case VPWidenLoadEVLSC: 109 case VPWidenLoadSC: 110 return true; 111 case VPReplicateSC: 112 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 113 ->mayReadFromMemory(); 114 case VPWidenCallSC: 115 return !cast<VPWidenCallRecipe>(this) 116 ->getCalledScalarFunction() 117 ->onlyWritesMemory(); 118 case VPWidenIntrinsicSC: 119 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory(); 120 case VPBranchOnMaskSC: 121 case VPFirstOrderRecurrencePHISC: 122 case VPPredInstPHISC: 123 case VPScalarIVStepsSC: 124 case VPWidenStoreEVLSC: 125 case VPWidenStoreSC: 126 return false; 127 case VPBlendSC: 128 case VPReductionEVLSC: 129 case VPReductionSC: 130 case VPVectorPointerSC: 131 case VPWidenCanonicalIVSC: 132 case VPWidenCastSC: 133 case VPWidenGEPSC: 134 case VPWidenIntOrFpInductionSC: 135 case VPWidenPHISC: 136 case VPWidenSC: 137 case VPWidenSelectSC: { 138 const Instruction *I = 139 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 140 (void)I; 141 assert((!I || !I->mayReadFromMemory()) && 142 "underlying instruction may read from memory"); 143 return false; 144 } 145 default: 146 return true; 147 } 148 } 149 150 bool VPRecipeBase::mayHaveSideEffects() const { 151 switch (getVPDefID()) { 152 case VPExpressionSC: 153 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects(); 154 case VPDerivedIVSC: 155 case VPFirstOrderRecurrencePHISC: 156 case VPPredInstPHISC: 157 case VPVectorEndPointerSC: 158 return false; 159 case VPInstructionSC: 160 return mayWriteToMemory(); 161 case VPWidenCallSC: { 162 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); 163 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); 164 } 165 case VPWidenIntrinsicSC: 166 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects(); 167 case VPBlendSC: 168 case VPReductionEVLSC: 169 case VPReductionSC: 170 case VPScalarIVStepsSC: 171 case VPVectorPointerSC: 172 case VPWidenCanonicalIVSC: 173 case VPWidenCastSC: 174 case VPWidenGEPSC: 175 case VPWidenIntOrFpInductionSC: 176 case VPWidenPHISC: 177 case VPWidenPointerInductionSC: 178 case VPWidenSC: 179 case VPWidenSelectSC: { 180 const Instruction *I = 181 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 182 (void)I; 183 assert((!I || !I->mayHaveSideEffects()) && 184 "underlying instruction has side-effects"); 185 return false; 186 } 187 case VPInterleaveSC: 188 return mayWriteToMemory(); 189 case VPWidenLoadEVLSC: 190 case VPWidenLoadSC: 191 case VPWidenStoreEVLSC: 192 case VPWidenStoreSC: 193 assert( 194 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == 195 mayWriteToMemory() && 196 "mayHaveSideffects result for ingredient differs from this " 197 "implementation"); 198 return mayWriteToMemory(); 199 case VPReplicateSC: { 200 auto *R = cast<VPReplicateRecipe>(this); 201 return R->getUnderlyingInstr()->mayHaveSideEffects(); 202 } 203 default: 204 return true; 205 } 206 } 207 208 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 209 assert(!Parent && "Recipe already in some VPBasicBlock"); 210 assert(InsertPos->getParent() && 211 "Insertion position not in any VPBasicBlock"); 212 InsertPos->getParent()->insert(this, InsertPos->getIterator()); 213 } 214 215 void VPRecipeBase::insertBefore(VPBasicBlock &BB, 216 iplist<VPRecipeBase>::iterator I) { 217 assert(!Parent && "Recipe already in some VPBasicBlock"); 218 assert(I == BB.end() || I->getParent() == &BB); 219 BB.insert(this, 
I); 220 } 221 222 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 223 assert(!Parent && "Recipe already in some VPBasicBlock"); 224 assert(InsertPos->getParent() && 225 "Insertion position not in any VPBasicBlock"); 226 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator())); 227 } 228 229 void VPRecipeBase::removeFromParent() { 230 assert(getParent() && "Recipe not in any VPBasicBlock"); 231 getParent()->getRecipeList().remove(getIterator()); 232 Parent = nullptr; 233 } 234 235 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 236 assert(getParent() && "Recipe not in any VPBasicBlock"); 237 return getParent()->getRecipeList().erase(getIterator()); 238 } 239 240 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 241 removeFromParent(); 242 insertAfter(InsertPos); 243 } 244 245 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 246 iplist<VPRecipeBase>::iterator I) { 247 removeFromParent(); 248 insertBefore(BB, I); 249 } 250 251 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { 252 // Get the underlying instruction for the recipe, if there is one. It is used 253 // to 254 // * decide if cost computation should be skipped for this recipe, 255 // * apply forced target instruction cost. 256 Instruction *UI = nullptr; 257 if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) 258 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 259 else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) 260 UI = IG->getInsertPos(); 261 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) 262 UI = &WidenMem->getIngredient(); 263 264 InstructionCost RecipeCost; 265 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { 266 RecipeCost = 0; 267 } else { 268 RecipeCost = computeCost(VF, Ctx); 269 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && 270 RecipeCost.isValid()) 271 RecipeCost = InstructionCost(ForceTargetInstructionCost); 272 } 273 274 LLVM_DEBUG({ 275 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; 276 dump(); 277 }); 278 return RecipeCost; 279 } 280 281 InstructionCost VPRecipeBase::computeCost(ElementCount VF, 282 VPCostContext &Ctx) const { 283 llvm_unreachable("subclasses should implement computeCost"); 284 } 285 286 bool VPRecipeBase::isPhi() const { 287 return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) || 288 (isa<VPInstruction>(this) && 289 cast<VPInstruction>(this)->getOpcode() == Instruction::PHI) || 290 isa<VPIRPhi>(this); 291 } 292 293 bool VPRecipeBase::isScalarCast() const { 294 auto *VPI = dyn_cast<VPInstruction>(this); 295 return VPI && Instruction::isCast(VPI->getOpcode()); 296 } 297 298 InstructionCost 299 VPPartialReductionRecipe::computeCost(ElementCount VF, 300 VPCostContext &Ctx) const { 301 std::optional<unsigned> Opcode; 302 VPValue *Op = getOperand(0); 303 VPRecipeBase *OpR = Op->getDefiningRecipe(); 304 305 // If the partial reduction is predicated, a select will be operand 0 306 using namespace llvm::VPlanPatternMatch; 307 if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) { 308 OpR = Op->getDefiningRecipe(); 309 } 310 311 Type *InputTypeA = nullptr, *InputTypeB = nullptr; 312 TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, 313 ExtBType = TTI::PR_None; 314 315 auto GetExtendKind = [](VPRecipeBase *R) { 316 if (!R) 317 return TTI::PR_None; 318 auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R); 319 if (!WidenCastR) 320 return TTI::PR_None; 321 if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) 322 return 
TTI::PR_ZeroExtend; 323 if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) 324 return TTI::PR_SignExtend; 325 return TTI::PR_None; 326 }; 327 328 // Pick out opcode, type/ext information and use sub side effects from a widen 329 // recipe. 330 auto HandleWiden = [&](VPWidenRecipe *Widen) { 331 if (match(Widen, 332 m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Op)))) { 333 Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe()); 334 } 335 Opcode = Widen->getOpcode(); 336 VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe(); 337 VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe(); 338 InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) 339 : Widen->getOperand(0)); 340 InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0) 341 : Widen->getOperand(1)); 342 ExtAType = GetExtendKind(ExtAR); 343 ExtBType = GetExtendKind(ExtBR); 344 }; 345 346 if (isa<VPWidenCastRecipe>(OpR)) { 347 InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0)); 348 ExtAType = GetExtendKind(OpR); 349 } else if (isa<VPReductionPHIRecipe>(OpR)) { 350 auto RedPhiOp1R = getOperand(1)->getDefiningRecipe(); 351 if (isa<VPWidenCastRecipe>(RedPhiOp1R)) { 352 InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0)); 353 ExtAType = GetExtendKind(RedPhiOp1R); 354 } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R)) 355 HandleWiden(Widen); 356 } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) { 357 HandleWiden(Widen); 358 } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) { 359 return Reduction->computeCost(VF, Ctx); 360 } 361 auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); 362 return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB, 363 PhiType, VF, ExtAType, ExtBType, 364 Opcode, Ctx.CostKind); 365 } 366 367 void VPPartialReductionRecipe::execute(VPTransformState &State) { 368 auto &Builder = State.Builder; 369 370 assert(getOpcode() == Instruction::Add && 371 "Unhandled partial reduction opcode"); 372 373 Value *BinOpVal = State.get(getOperand(1)); 374 Value *PhiVal = State.get(getOperand(0)); 375 assert(PhiVal && BinOpVal && "Phi and Mul must be set"); 376 377 Type *RetTy = PhiVal->getType(); 378 379 CallInst *V = Builder.CreateIntrinsic( 380 RetTy, Intrinsic::experimental_vector_partial_reduce_add, 381 {PhiVal, BinOpVal}, nullptr, "partial.reduce"); 382 383 State.set(this, V); 384 } 385 386 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 387 void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, 388 VPSlotTracker &SlotTracker) const { 389 O << Indent << "PARTIAL-REDUCE "; 390 printAsOperand(O, SlotTracker); 391 O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; 392 printOperands(O, SlotTracker); 393 } 394 #endif 395 396 FastMathFlags VPIRFlags::getFastMathFlags() const { 397 assert(OpType == OperationType::FPMathOp && 398 "recipe doesn't have fast math flags"); 399 FastMathFlags Res; 400 Res.setAllowReassoc(FMFs.AllowReassoc); 401 Res.setNoNaNs(FMFs.NoNaNs); 402 Res.setNoInfs(FMFs.NoInfs); 403 Res.setNoSignedZeros(FMFs.NoSignedZeros); 404 Res.setAllowReciprocal(FMFs.AllowReciprocal); 405 Res.setAllowContract(FMFs.AllowContract); 406 Res.setApproxFunc(FMFs.ApproxFunc); 407 return Res; 408 } 409 410 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 411 void VPSingleDefRecipe::dump() const { VPDef::dump(); } 412 #endif 413 414 template <unsigned PartOpIdx> 415 VPValue * 416 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { 417 if 
(U.getNumOperands() == PartOpIdx + 1) 418 return U.getOperand(PartOpIdx); 419 return nullptr; 420 } 421 422 template <unsigned PartOpIdx> 423 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { 424 if (auto *UnrollPartOp = getUnrollPartOperand(U)) 425 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); 426 return 0; 427 } 428 429 namespace llvm { 430 template class VPUnrollPartAccessor<2>; 431 template class VPUnrollPartAccessor<3>; 432 } 433 434 VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, 435 const VPIRFlags &Flags, DebugLoc DL, 436 const Twine &Name) 437 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), 438 VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { 439 assert(flagsValidForOpcode(getOpcode()) && 440 "Set flags not supported for the provided opcode"); 441 assert((getNumOperandsForOpcode(Opcode) == -1u || 442 getNumOperandsForOpcode(Opcode) == getNumOperands()) && 443 "number of operands does not match opcode"); 444 } 445 446 #ifndef NDEBUG 447 unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { 448 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) 449 return 1; 450 451 if (Instruction::isBinaryOp(Opcode)) 452 return 2; 453 454 switch (Opcode) { 455 case VPInstruction::StepVector: 456 return 0; 457 case Instruction::Alloca: 458 case Instruction::ExtractValue: 459 case Instruction::Freeze: 460 case Instruction::Load: 461 case VPInstruction::AnyOf: 462 case VPInstruction::BranchOnCond: 463 case VPInstruction::CalculateTripCountMinusVF: 464 case VPInstruction::CanonicalIVIncrementForPart: 465 case VPInstruction::ExplicitVectorLength: 466 case VPInstruction::ExtractLastElement: 467 case VPInstruction::ExtractPenultimateElement: 468 case VPInstruction::FirstActiveLane: 469 case VPInstruction::Not: 470 return 1; 471 case Instruction::ICmp: 472 case Instruction::FCmp: 473 case Instruction::Store: 474 case VPInstruction::ActiveLaneMask: 475 case VPInstruction::BranchOnCount: 476 case VPInstruction::ComputeReductionResult: 477 case VPInstruction::FirstOrderRecurrenceSplice: 478 case VPInstruction::LogicalAnd: 479 case VPInstruction::PtrAdd: 480 case VPInstruction::WideIVStep: 481 return 2; 482 case Instruction::Select: 483 case VPInstruction::ComputeAnyOfResult: 484 case VPInstruction::ReductionStartVector: 485 return 3; 486 case VPInstruction::ComputeFindIVResult: 487 return 4; 488 case Instruction::Call: 489 case Instruction::GetElementPtr: 490 case Instruction::PHI: 491 case Instruction::Switch: 492 // Cannot determine the number of operands from the opcode. 
493 return -1u; 494 } 495 llvm_unreachable("all cases should be handled above"); 496 } 497 #endif 498 499 bool VPInstruction::doesGeneratePerAllLanes() const { 500 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); 501 } 502 503 bool VPInstruction::canGenerateScalarForFirstLane() const { 504 if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) 505 return true; 506 if (isSingleScalar() || isVectorToScalar()) 507 return true; 508 switch (Opcode) { 509 case Instruction::Freeze: 510 case Instruction::ICmp: 511 case Instruction::PHI: 512 case Instruction::Select: 513 case VPInstruction::BranchOnCond: 514 case VPInstruction::BranchOnCount: 515 case VPInstruction::CalculateTripCountMinusVF: 516 case VPInstruction::CanonicalIVIncrementForPart: 517 case VPInstruction::PtrAdd: 518 case VPInstruction::ExplicitVectorLength: 519 case VPInstruction::AnyOf: 520 return true; 521 default: 522 return false; 523 } 524 } 525 526 Value *VPInstruction::generatePerLane(VPTransformState &State, 527 const VPLane &Lane) { 528 IRBuilderBase &Builder = State.Builder; 529 530 assert(getOpcode() == VPInstruction::PtrAdd && 531 "only PtrAdd opcodes are supported for now"); 532 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), 533 State.get(getOperand(1), Lane), Name); 534 } 535 536 /// Create a conditional branch using \p Cond branching to the successors of \p 537 /// VPBB. Note that the first successor is always forward (i.e. not created yet) 538 /// while the second successor may already have been created (if it is a header 539 /// block and VPBB is a latch). 540 static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, 541 VPTransformState &State) { 542 // Replace the temporary unreachable terminator with a new conditional 543 // branch, hooking it up to backward destination (header) for latch blocks 544 // now, and to forward destination(s) later when they are created. 545 // Second successor may be backwards - iff it is already in VPBB2IRBB. 
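// Illustrative sketch (added note, not part of the original source): for a latch
// block the emitted IR is roughly
//   br i1 %cond, label <tbd>, label %vector.body
// i.e. the backward (second) successor already resolves to the header block,
// while the forward (first) successor is left null here and only patched in once
// its IR basic block has been created.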
546 VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]); 547 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc); 548 BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB]; 549 BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc); 550 // First successor is always forward, reset it to nullptr 551 CondBr->setSuccessor(0, nullptr); 552 IRBB->getTerminator()->eraseFromParent(); 553 return CondBr; 554 } 555 556 Value *VPInstruction::generate(VPTransformState &State) { 557 IRBuilderBase &Builder = State.Builder; 558 559 if (Instruction::isBinaryOp(getOpcode())) { 560 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 561 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 562 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 563 auto *Res = 564 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 565 if (auto *I = dyn_cast<Instruction>(Res)) 566 applyFlags(*I); 567 return Res; 568 } 569 570 switch (getOpcode()) { 571 case VPInstruction::Not: { 572 Value *A = State.get(getOperand(0)); 573 return Builder.CreateNot(A, Name); 574 } 575 case Instruction::ExtractElement: { 576 assert(State.VF.isVector() && "Only extract elements from vectors"); 577 if (getOperand(1)->isLiveIn()) { 578 unsigned IdxToExtract = 579 cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue(); 580 return State.get(getOperand(0), VPLane(IdxToExtract)); 581 } 582 Value *Vec = State.get(getOperand(0)); 583 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); 584 return Builder.CreateExtractElement(Vec, Idx, Name); 585 } 586 case Instruction::Freeze: { 587 Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); 588 return Builder.CreateFreeze(Op, Name); 589 } 590 case Instruction::FCmp: 591 case Instruction::ICmp: { 592 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 593 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 594 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 595 return Builder.CreateCmp(getPredicate(), A, B, Name); 596 } 597 case Instruction::PHI: { 598 llvm_unreachable("should be handled by VPPhi::execute"); 599 } 600 case Instruction::Select: { 601 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 602 Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); 603 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); 604 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); 605 return Builder.CreateSelect(Cond, Op1, Op2, Name); 606 } 607 case VPInstruction::ActiveLaneMask: { 608 // Get first lane of vector induction variable. 609 Value *VIVElem0 = State.get(getOperand(0), VPLane(0)); 610 // Get the original loop tripcount. 611 Value *ScalarTC = State.get(getOperand(1), VPLane(0)); 612 613 // If this part of the active lane mask is scalar, generate the CMP directly 614 // to avoid unnecessary extracts. 615 if (State.VF.isScalar()) 616 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, 617 Name); 618 619 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 620 auto *PredTy = VectorType::get(Int1Ty, State.VF); 621 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 622 {PredTy, ScalarTC->getType()}, 623 {VIVElem0, ScalarTC}, nullptr, Name); 624 } 625 case VPInstruction::FirstOrderRecurrenceSplice: { 626 // Generate code to combine the previous and current values in vector v3. 
627 // 628 // vector.ph: 629 // v_init = vector(..., ..., ..., a[-1]) 630 // br vector.body 631 // 632 // vector.body 633 // i = phi [0, vector.ph], [i+4, vector.body] 634 // v1 = phi [v_init, vector.ph], [v2, vector.body] 635 // v2 = a[i, i+1, i+2, i+3]; 636 // v3 = vector(v1(3), v2(0, 1, 2)) 637 638 auto *V1 = State.get(getOperand(0)); 639 if (!V1->getType()->isVectorTy()) 640 return V1; 641 Value *V2 = State.get(getOperand(1)); 642 return Builder.CreateVectorSplice(V1, V2, -1, Name); 643 } 644 case VPInstruction::CalculateTripCountMinusVF: { 645 unsigned UF = getParent()->getPlan()->getUF(); 646 Value *ScalarTC = State.get(getOperand(0), VPLane(0)); 647 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF); 648 Value *Sub = Builder.CreateSub(ScalarTC, Step); 649 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 650 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 651 return Builder.CreateSelect(Cmp, Sub, Zero); 652 } 653 case VPInstruction::ExplicitVectorLength: { 654 // TODO: Restructure this code with an explicit remainder loop, vsetvli can 655 // be outside of the main loop. 656 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true); 657 // Compute EVL 658 assert(AVL->getType()->isIntegerTy() && 659 "Requested vector length should be an integer."); 660 661 assert(State.VF.isScalable() && "Expected scalable vector factor."); 662 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); 663 664 Value *EVL = State.Builder.CreateIntrinsic( 665 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, 666 {AVL, VFArg, State.Builder.getTrue()}); 667 return EVL; 668 } 669 case VPInstruction::CanonicalIVIncrementForPart: { 670 unsigned Part = getUnrollPart(*this); 671 auto *IV = State.get(getOperand(0), VPLane(0)); 672 assert(Part != 0 && "Must have a positive part"); 673 // The canonical IV is incremented by the vectorization factor (num of 674 // SIMD elements) times the unroll part. 675 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 676 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 677 hasNoSignedWrap()); 678 } 679 case VPInstruction::BranchOnCond: { 680 Value *Cond = State.get(getOperand(0), VPLane(0)); 681 auto *Br = createCondBranch(Cond, getParent(), State); 682 applyMetadata(*Br); 683 return Br; 684 } 685 case VPInstruction::BranchOnCount: { 686 // First create the compare. 687 Value *IV = State.get(getOperand(0), /*IsScalar*/ true); 688 Value *TC = State.get(getOperand(1), /*IsScalar*/ true); 689 Value *Cond = Builder.CreateICmpEQ(IV, TC); 690 return createCondBranch(Cond, getParent(), State); 691 } 692 case VPInstruction::Broadcast: { 693 return Builder.CreateVectorSplat( 694 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast"); 695 } 696 case VPInstruction::BuildStructVector: { 697 // For struct types, we need to build a new 'wide' struct type, where each 698 // element is widened, i.e., we create a struct of vectors. 
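// Illustrative example (added for clarity, not in the original source): a call
// returning {i32, float} widened with VF = 4 produces a value of type
// {<4 x i32>, <4 x float>}; each scalar operand's fields are extracted and
// inserted into the matching field vector below.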
699 auto *StructTy = 700 cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0))); 701 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF)); 702 for (const auto &[LaneIndex, Op] : enumerate(operands())) { 703 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); 704 FieldIndex++) { 705 Value *ScalarValue = 706 Builder.CreateExtractValue(State.get(Op, true), FieldIndex); 707 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex); 708 VectorValue = 709 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex); 710 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex); 711 } 712 } 713 return Res; 714 } 715 case VPInstruction::BuildVector: { 716 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); 717 auto NumOfElements = ElementCount::getFixed(getNumOperands()); 718 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements)); 719 for (const auto &[Idx, Op] : enumerate(operands())) 720 Res = State.Builder.CreateInsertElement(Res, State.get(Op, true), 721 State.Builder.getInt32(Idx)); 722 return Res; 723 } 724 case VPInstruction::ReductionStartVector: { 725 if (State.VF.isScalar()) 726 return State.get(getOperand(0), true); 727 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 728 Builder.setFastMathFlags(getFastMathFlags()); 729 // If this start vector is scaled then it should produce a vector with fewer 730 // elements than the VF. 731 ElementCount VF = State.VF.divideCoefficientBy( 732 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue()); 733 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true)); 734 Constant *Zero = Builder.getInt32(0); 735 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true), 736 Zero); 737 } 738 case VPInstruction::ComputeAnyOfResult: { 739 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 740 // and will be removed by breaking up the recipe further. 741 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 742 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 743 Value *ReducedPartRdx = State.get(getOperand(2)); 744 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) 745 ReducedPartRdx = Builder.CreateBinOp( 746 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode( 747 RecurKind::AnyOf), 748 State.get(getOperand(Idx)), ReducedPartRdx, "bin.rdx"); 749 return createAnyOfReduction(Builder, ReducedPartRdx, 750 State.get(getOperand(1), VPLane(0)), OrigPhi); 751 } 752 case VPInstruction::ComputeFindIVResult: { 753 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 754 // and will be removed by breaking up the recipe further. 755 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 756 // Get its reduction variable descriptor. 757 RecurKind RK = PhiR->getRecurrenceKind(); 758 assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && 759 "Unexpected reduction kind"); 760 assert(!PhiR->isInLoop() && 761 "In-loop FindLastIV reduction is not supported yet"); 762 763 // The recipe's operands are the reduction phi, the start value, the 764 // sentinel value, followed by one operand for each part of the reduction. 765 unsigned UF = getNumOperands() - 3; 766 Value *ReducedPartRdx = State.get(getOperand(3)); 767 RecurKind MinMaxKind; 768 bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK); 769 if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) 770 MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax; 771 else 772 MinMaxKind = IsSigned ? 
RecurKind::SMin : RecurKind::UMin; 773 for (unsigned Part = 1; Part < UF; ++Part) 774 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 775 State.get(getOperand(3 + Part))); 776 777 Value *Start = State.get(getOperand(1), true); 778 Value *Sentinel = getOperand(2)->getLiveInIRValue(); 779 return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start, 780 Sentinel); 781 } 782 case VPInstruction::ComputeReductionResult: { 783 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 784 // and will be removed by breaking up the recipe further. 785 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 786 // Get its reduction variable descriptor. 787 788 RecurKind RK = PhiR->getRecurrenceKind(); 789 assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && 790 "should be handled by ComputeFindIVResult"); 791 792 // The recipe's operands are the reduction phi, followed by one operand for 793 // each part of the reduction. 794 unsigned UF = getNumOperands() - 1; 795 VectorParts RdxParts(UF); 796 for (unsigned Part = 0; Part < UF; ++Part) 797 RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop()); 798 799 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 800 if (hasFastMathFlags()) 801 Builder.setFastMathFlags(getFastMathFlags()); 802 803 // Reduce all of the unrolled parts into a single vector. 804 Value *ReducedPartRdx = RdxParts[0]; 805 if (PhiR->isOrdered()) { 806 ReducedPartRdx = RdxParts[UF - 1]; 807 } else { 808 // Floating-point operations should have some FMF to enable the reduction. 809 for (unsigned Part = 1; Part < UF; ++Part) { 810 Value *RdxPart = RdxParts[Part]; 811 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) 812 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 813 else 814 ReducedPartRdx = Builder.CreateBinOp( 815 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK), 816 RdxPart, ReducedPartRdx, "bin.rdx"); 817 } 818 } 819 820 // Create the reduction after the loop. Note that inloop reductions create 821 // the target reduction in the loop using a Reduction recipe. 822 if (State.VF.isVector() && !PhiR->isInLoop()) { 823 // TODO: Support in-order reductions based on the recurrence descriptor. 824 // All ops in the reduction inherit fast-math-flags from the recurrence 825 // descriptor. 826 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK); 827 } 828 829 return ReducedPartRdx; 830 } 831 case VPInstruction::ExtractLastElement: 832 case VPInstruction::ExtractPenultimateElement: { 833 unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2; 834 Value *Res; 835 if (State.VF.isVector()) { 836 assert(Offset <= State.VF.getKnownMinValue() && 837 "invalid offset to extract from"); 838 // Extract lane VF - Offset from the operand. 
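// Example (illustrative): with VF = 4, ExtractLastElement reads lane 3
// (Offset = 1) and ExtractPenultimateElement reads lane 2 (Offset = 2).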
839 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset)); 840 } else { 841 assert(Offset <= 1 && "invalid offset to extract from"); 842 Res = State.get(getOperand(0)); 843 } 844 if (isa<ExtractElementInst>(Res)) 845 Res->setName(Name); 846 return Res; 847 } 848 case VPInstruction::LogicalAnd: { 849 Value *A = State.get(getOperand(0)); 850 Value *B = State.get(getOperand(1)); 851 return Builder.CreateLogicalAnd(A, B, Name); 852 } 853 case VPInstruction::PtrAdd: { 854 assert(vputils::onlyFirstLaneUsed(this) && 855 "can only generate first lane for PtrAdd"); 856 Value *Ptr = State.get(getOperand(0), VPLane(0)); 857 Value *Addend = State.get(getOperand(1), VPLane(0)); 858 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); 859 } 860 case VPInstruction::AnyOf: { 861 Value *Res = State.get(getOperand(0)); 862 for (VPValue *Op : drop_begin(operands())) 863 Res = Builder.CreateOr(Res, State.get(Op)); 864 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); 865 } 866 case VPInstruction::FirstActiveLane: { 867 if (getNumOperands() == 1) { 868 Value *Mask = State.get(getOperand(0)); 869 return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask, 870 true, Name); 871 } 872 // If there are multiple operands, create a chain of selects to pick the 873 // first operand with an active lane and add the number of lanes of the 874 // preceding operands. 875 Value *RuntimeVF = 876 getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF); 877 unsigned LastOpIdx = getNumOperands() - 1; 878 Value *Res = nullptr; 879 for (int Idx = LastOpIdx; Idx >= 0; --Idx) { 880 Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( 881 Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name); 882 Value *Current = Builder.CreateAdd( 883 Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); 884 if (Res) { 885 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF); 886 Res = Builder.CreateSelect(Cmp, Current, Res); 887 } else { 888 Res = Current; 889 } 890 } 891 892 return Res; 893 } 894 default: 895 llvm_unreachable("Unsupported opcode for instruction"); 896 } 897 } 898 899 InstructionCost VPInstruction::computeCost(ElementCount VF, 900 VPCostContext &Ctx) const { 901 if (Instruction::isBinaryOp(getOpcode())) { 902 Type *ResTy = Ctx.Types.inferScalarType(this); 903 if (!vputils::onlyFirstLaneUsed(this)) 904 ResTy = toVectorTy(ResTy, VF); 905 906 if (!getUnderlyingValue()) { 907 switch (getOpcode()) { 908 case Instruction::FMul: 909 return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind); 910 default: 911 // TODO: Compute cost for VPInstructions without underlying values once 912 // the legacy cost model has been retired. 913 return 0; 914 } 915 } 916 917 assert(!doesGeneratePerAllLanes() && 918 "Should only generate a vector value or single scalar, not scalars " 919 "for all lanes."); 920 return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind); 921 } 922 923 switch (getOpcode()) { 924 case Instruction::ExtractElement: { 925 // Add on the cost of extracting the element. 
926 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); 927 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, 928 Ctx.CostKind); 929 } 930 case VPInstruction::AnyOf: { 931 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 932 return Ctx.TTI.getArithmeticReductionCost( 933 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind); 934 } 935 case VPInstruction::FirstActiveLane: { 936 // Calculate the cost of determining the lane index. 937 auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); 938 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, 939 Type::getInt64Ty(Ctx.LLVMCtx), 940 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); 941 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); 942 } 943 case VPInstruction::FirstOrderRecurrenceSplice: { 944 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); 945 SmallVector<int> Mask(VF.getKnownMinValue()); 946 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 947 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 948 949 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 950 cast<VectorType>(VectorTy), 951 cast<VectorType>(VectorTy), Mask, 952 Ctx.CostKind, VF.getKnownMinValue() - 1); 953 } 954 case VPInstruction::ActiveLaneMask: { 955 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0)); 956 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); 957 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, 958 {ArgTy, ArgTy}); 959 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); 960 } 961 case VPInstruction::ExplicitVectorLength: { 962 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0)); 963 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx); 964 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx); 965 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length, 966 I32Ty, {Arg0Ty, I32Ty, I1Ty}); 967 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); 968 } 969 case VPInstruction::ExtractPenultimateElement: 970 if (VF == ElementCount::getScalable(1)) 971 return InstructionCost::getInvalid(); 972 LLVM_FALLTHROUGH; 973 default: 974 // TODO: Compute the cost of other VPInstructions once the legacy cost model has 975 // been retired.
976 assert(!getUnderlyingValue() && 977 "unexpected VPInstruction with underlying value"); 978 return 0; 979 } 980 } 981 982 bool VPInstruction::isVectorToScalar() const { 983 return getOpcode() == VPInstruction::ExtractLastElement || 984 getOpcode() == VPInstruction::ExtractPenultimateElement || 985 getOpcode() == Instruction::ExtractElement || 986 getOpcode() == VPInstruction::FirstActiveLane || 987 getOpcode() == VPInstruction::ComputeAnyOfResult || 988 getOpcode() == VPInstruction::ComputeFindIVResult || 989 getOpcode() == VPInstruction::ComputeReductionResult || 990 getOpcode() == VPInstruction::AnyOf; 991 } 992 993 bool VPInstruction::isSingleScalar() const { 994 return getOpcode() == Instruction::PHI || isScalarCast(); 995 } 996 997 void VPInstruction::execute(VPTransformState &State) { 998 assert(!State.Lane && "VPInstruction executing a Lane"); 999 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 1000 assert(flagsValidForOpcode(getOpcode()) && 1001 "Set flags not supported for the provided opcode"); 1002 if (hasFastMathFlags()) 1003 State.Builder.setFastMathFlags(getFastMathFlags()); 1004 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && 1005 (vputils::onlyFirstLaneUsed(this) || 1006 isVectorToScalar() || isSingleScalar()); 1007 bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); 1008 if (GeneratesPerAllLanes) { 1009 for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue(); 1010 Lane != NumLanes; ++Lane) { 1011 Value *GeneratedValue = generatePerLane(State, VPLane(Lane)); 1012 assert(GeneratedValue && "generatePerLane must produce a value"); 1013 State.set(this, GeneratedValue, VPLane(Lane)); 1014 } 1015 return; 1016 } 1017 1018 Value *GeneratedValue = generate(State); 1019 if (!hasResult()) 1020 return; 1021 assert(GeneratedValue && "generate must produce a value"); 1022 assert((((GeneratedValue->getType()->isVectorTy() || 1023 GeneratedValue->getType()->isStructTy()) == 1024 !GeneratesPerFirstLaneOnly) || 1025 State.VF.isScalar()) && 1026 "scalar value but not only first lane defined"); 1027 State.set(this, GeneratedValue, 1028 /*IsScalar*/ GeneratesPerFirstLaneOnly); 1029 } 1030 1031 bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { 1032 if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) 1033 return false; 1034 switch (getOpcode()) { 1035 case Instruction::ExtractElement: 1036 case Instruction::Freeze: 1037 case Instruction::FCmp: 1038 case Instruction::ICmp: 1039 case Instruction::Select: 1040 case VPInstruction::AnyOf: 1041 case VPInstruction::BuildStructVector: 1042 case VPInstruction::BuildVector: 1043 case VPInstruction::CalculateTripCountMinusVF: 1044 case VPInstruction::CanonicalIVIncrementForPart: 1045 case VPInstruction::ExtractLastElement: 1046 case VPInstruction::ExtractPenultimateElement: 1047 case VPInstruction::FirstActiveLane: 1048 case VPInstruction::FirstOrderRecurrenceSplice: 1049 case VPInstruction::LogicalAnd: 1050 case VPInstruction::Not: 1051 case VPInstruction::PtrAdd: 1052 case VPInstruction::WideIVStep: 1053 case VPInstruction::StepVector: 1054 case VPInstruction::ReductionStartVector: 1055 return false; 1056 default: 1057 return true; 1058 } 1059 } 1060 1061 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { 1062 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 1063 if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) 1064 return vputils::onlyFirstLaneUsed(this); 1065 1066 switch (getOpcode()) { 1067 default:
1068 return false; 1069 case Instruction::ExtractElement: 1070 return Op == getOperand(1); 1071 case Instruction::PHI: 1072 return true; 1073 case Instruction::FCmp: 1074 case Instruction::ICmp: 1075 case Instruction::Select: 1076 case Instruction::Or: 1077 case Instruction::Freeze: 1078 // TODO: Cover additional opcodes. 1079 return vputils::onlyFirstLaneUsed(this); 1080 case VPInstruction::ActiveLaneMask: 1081 case VPInstruction::ExplicitVectorLength: 1082 case VPInstruction::CalculateTripCountMinusVF: 1083 case VPInstruction::CanonicalIVIncrementForPart: 1084 case VPInstruction::BranchOnCount: 1085 case VPInstruction::BranchOnCond: 1086 case VPInstruction::Broadcast: 1087 case VPInstruction::ReductionStartVector: 1088 return true; 1089 case VPInstruction::PtrAdd: 1090 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); 1091 case VPInstruction::ComputeAnyOfResult: 1092 case VPInstruction::ComputeFindIVResult: 1093 return Op == getOperand(1); 1094 }; 1095 llvm_unreachable("switch should return"); 1096 } 1097 1098 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { 1099 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 1100 if (Instruction::isBinaryOp(getOpcode())) 1101 return vputils::onlyFirstPartUsed(this); 1102 1103 switch (getOpcode()) { 1104 default: 1105 return false; 1106 case Instruction::FCmp: 1107 case Instruction::ICmp: 1108 case Instruction::Select: 1109 return vputils::onlyFirstPartUsed(this); 1110 case VPInstruction::BranchOnCount: 1111 case VPInstruction::BranchOnCond: 1112 case VPInstruction::CanonicalIVIncrementForPart: 1113 return true; 1114 }; 1115 llvm_unreachable("switch should return"); 1116 } 1117 1118 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1119 void VPInstruction::dump() const { 1120 VPSlotTracker SlotTracker(getParent()->getPlan()); 1121 print(dbgs(), "", SlotTracker); 1122 } 1123 1124 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 1125 VPSlotTracker &SlotTracker) const { 1126 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; 1127 1128 if (hasResult()) { 1129 printAsOperand(O, SlotTracker); 1130 O << " = "; 1131 } 1132 1133 switch (getOpcode()) { 1134 case VPInstruction::Not: 1135 O << "not"; 1136 break; 1137 case VPInstruction::SLPLoad: 1138 O << "combined load"; 1139 break; 1140 case VPInstruction::SLPStore: 1141 O << "combined store"; 1142 break; 1143 case VPInstruction::ActiveLaneMask: 1144 O << "active lane mask"; 1145 break; 1146 case VPInstruction::ExplicitVectorLength: 1147 O << "EXPLICIT-VECTOR-LENGTH"; 1148 break; 1149 case VPInstruction::FirstOrderRecurrenceSplice: 1150 O << "first-order splice"; 1151 break; 1152 case VPInstruction::BranchOnCond: 1153 O << "branch-on-cond"; 1154 break; 1155 case VPInstruction::CalculateTripCountMinusVF: 1156 O << "TC > VF ? 
TC - VF : 0"; 1157 break; 1158 case VPInstruction::CanonicalIVIncrementForPart: 1159 O << "VF * Part +"; 1160 break; 1161 case VPInstruction::BranchOnCount: 1162 O << "branch-on-count"; 1163 break; 1164 case VPInstruction::Broadcast: 1165 O << "broadcast"; 1166 break; 1167 case VPInstruction::BuildStructVector: 1168 O << "buildstructvector"; 1169 break; 1170 case VPInstruction::BuildVector: 1171 O << "buildvector"; 1172 break; 1173 case VPInstruction::ExtractLastElement: 1174 O << "extract-last-element"; 1175 break; 1176 case VPInstruction::ExtractPenultimateElement: 1177 O << "extract-penultimate-element"; 1178 break; 1179 case VPInstruction::ComputeAnyOfResult: 1180 O << "compute-anyof-result"; 1181 break; 1182 case VPInstruction::ComputeFindIVResult: 1183 O << "compute-find-iv-result"; 1184 break; 1185 case VPInstruction::ComputeReductionResult: 1186 O << "compute-reduction-result"; 1187 break; 1188 case VPInstruction::LogicalAnd: 1189 O << "logical-and"; 1190 break; 1191 case VPInstruction::PtrAdd: 1192 O << "ptradd"; 1193 break; 1194 case VPInstruction::AnyOf: 1195 O << "any-of"; 1196 break; 1197 case VPInstruction::FirstActiveLane: 1198 O << "first-active-lane"; 1199 break; 1200 case VPInstruction::ReductionStartVector: 1201 O << "reduction-start-vector"; 1202 break; 1203 default: 1204 O << Instruction::getOpcodeName(getOpcode()); 1205 } 1206 1207 printFlags(O); 1208 printOperands(O, SlotTracker); 1209 1210 if (auto DL = getDebugLoc()) { 1211 O << ", !dbg "; 1212 DL.print(O); 1213 } 1214 } 1215 #endif 1216 1217 void VPInstructionWithType::execute(VPTransformState &State) { 1218 State.setDebugLocFrom(getDebugLoc()); 1219 if (isScalarCast()) { 1220 Value *Op = State.get(getOperand(0), VPLane(0)); 1221 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()), 1222 Op, ResultTy); 1223 State.set(this, Cast, VPLane(0)); 1224 return; 1225 } 1226 switch (getOpcode()) { 1227 case VPInstruction::StepVector: { 1228 Value *StepVector = 1229 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF)); 1230 State.set(this, StepVector); 1231 break; 1232 } 1233 default: 1234 llvm_unreachable("opcode not implemented yet"); 1235 } 1236 } 1237 1238 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1239 void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent, 1240 VPSlotTracker &SlotTracker) const { 1241 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; 1242 printAsOperand(O, SlotTracker); 1243 O << " = "; 1244 1245 switch (getOpcode()) { 1246 case VPInstruction::WideIVStep: 1247 O << "wide-iv-step "; 1248 printOperands(O, SlotTracker); 1249 break; 1250 case VPInstruction::StepVector: 1251 O << "step-vector " << *ResultTy; 1252 break; 1253 default: 1254 assert(Instruction::isCast(getOpcode()) && "unhandled opcode"); 1255 O << Instruction::getOpcodeName(getOpcode()) << " "; 1256 printOperands(O, SlotTracker); 1257 O << " to " << *ResultTy; 1258 } 1259 } 1260 #endif 1261 1262 void VPPhi::execute(VPTransformState &State) { 1263 State.setDebugLocFrom(getDebugLoc()); 1264 PHINode *NewPhi = State.Builder.CreatePHI( 1265 State.TypeAnalysis.inferScalarType(this), 2, getName()); 1266 unsigned NumIncoming = getNumIncoming(); 1267 if (getParent() != getParent()->getPlan()->getScalarPreheader()) { 1268 // TODO: Fixup all incoming values of header phis once recipes defining them 1269 // are introduced. 
1270 NumIncoming = 1; 1271 } 1272 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) { 1273 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0)); 1274 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx)); 1275 NewPhi->addIncoming(IncV, PredBB); 1276 } 1277 State.set(this, NewPhi, VPLane(0)); 1278 } 1279 1280 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1281 void VPPhi::print(raw_ostream &O, const Twine &Indent, 1282 VPSlotTracker &SlotTracker) const { 1283 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; 1284 printAsOperand(O, SlotTracker); 1285 O << " = phi "; 1286 printPhiOperands(O, SlotTracker); 1287 } 1288 #endif 1289 1290 VPIRInstruction *VPIRInstruction ::create(Instruction &I) { 1291 if (auto *Phi = dyn_cast<PHINode>(&I)) 1292 return new VPIRPhi(*Phi); 1293 return new VPIRInstruction(I); 1294 } 1295 1296 void VPIRInstruction::execute(VPTransformState &State) { 1297 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 && 1298 "PHINodes must be handled by VPIRPhi"); 1299 // Advance the insert point after the wrapped IR instruction. This allows 1300 // interleaving VPIRInstructions and other recipes. 1301 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); 1302 } 1303 1304 InstructionCost VPIRInstruction::computeCost(ElementCount VF, 1305 VPCostContext &Ctx) const { 1306 // The recipe wraps an existing IR instruction on the border of VPlan's scope, 1307 // hence it does not contribute to the cost-modeling for the VPlan. 1308 return 0; 1309 } 1310 1311 void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) { 1312 assert(isa<PHINode>(getInstruction()) && 1313 "can only update exiting operands to phi nodes"); 1314 assert(getNumOperands() > 0 && "must have at least one operand"); 1315 VPValue *Exiting = getOperand(0); 1316 if (Exiting->isLiveIn()) 1317 return; 1318 1319 Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting}); 1320 setOperand(0, Exiting); 1321 } 1322 1323 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1324 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, 1325 VPSlotTracker &SlotTracker) const { 1326 O << Indent << "IR " << I; 1327 } 1328 #endif 1329 1330 void VPIRPhi::execute(VPTransformState &State) { 1331 PHINode *Phi = &getIRPhi(); 1332 for (const auto &[Idx, Op] : enumerate(operands())) { 1333 VPValue *ExitValue = Op; 1334 auto Lane = vputils::isSingleScalar(ExitValue) 1335 ? VPLane::getFirstLane() 1336 : VPLane::getLastLaneForVF(State.VF); 1337 VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; 1338 auto *PredVPBB = Pred->getExitingBasicBlock(); 1339 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; 1340 // Set insertion point in PredBB in case an extract needs to be generated. 1341 // TODO: Model extracts explicitly. 1342 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); 1343 Value *V = State.get(ExitValue, VPLane(Lane)); 1344 // If there is no existing block for PredBB in the phi, add a new incoming 1345 // value. Otherwise update the existing incoming value for PredBB. 1346 if (Phi->getBasicBlockIndex(PredBB) == -1) 1347 Phi->addIncoming(V, PredBB); 1348 else 1349 Phi->setIncomingValueForBlock(PredBB, V); 1350 } 1351 1352 // Advance the insert point after the wrapped IR instruction. This allows 1353 // interleaving VPIRInstructions and other recipes. 
1354 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator())); 1355 } 1356 1357 void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { 1358 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); 1359 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && 1360 "Number of phi operands must match number of predecessors"); 1361 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock); 1362 R->removeOperand(Position); 1363 } 1364 1365 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1366 void VPPhiAccessors::printPhiOperands(raw_ostream &O, 1367 VPSlotTracker &SlotTracker) const { 1368 interleaveComma(enumerate(getAsRecipe()->operands()), O, 1369 [this, &O, &SlotTracker](auto Op) { 1370 O << "[ "; 1371 Op.value()->printAsOperand(O, SlotTracker); 1372 O << ", "; 1373 getIncomingBlock(Op.index())->printAsOperand(O); 1374 O << " ]"; 1375 }); 1376 } 1377 #endif 1378 1379 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1380 void VPIRPhi::print(raw_ostream &O, const Twine &Indent, 1381 VPSlotTracker &SlotTracker) const { 1382 VPIRInstruction::print(O, Indent, SlotTracker); 1383 1384 if (getNumOperands() != 0) { 1385 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; 1386 interleaveComma( 1387 enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { 1388 Op.value()->printAsOperand(O, SlotTracker); 1389 O << " from "; 1390 getParent()->getPredecessors()[Op.index()]->printAsOperand(O); 1391 }); 1392 O << ")"; 1393 } 1394 } 1395 #endif 1396 1397 VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer) 1398 : VPIRMetadata(I) { 1399 if (!LVer || !isa<LoadInst, StoreInst>(&I)) 1400 return; 1401 const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I); 1402 if (AliasScopeMD) 1403 Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD); 1404 if (NoAliasMD) 1405 Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD); 1406 } 1407 1408 void VPIRMetadata::applyMetadata(Instruction &I) const { 1409 for (const auto &[Kind, Node] : Metadata) 1410 I.setMetadata(Kind, Node); 1411 } 1412 1413 void VPWidenCallRecipe::execute(VPTransformState &State) { 1414 assert(State.VF.isVector() && "not widening"); 1415 assert(Variant != nullptr && "Can't create vector function."); 1416 1417 FunctionType *VFTy = Variant->getFunctionType(); 1418 // Add return type if intrinsic is overloaded on it. 1419 SmallVector<Value *, 4> Args; 1420 for (const auto &I : enumerate(args())) { 1421 Value *Arg; 1422 // Some vectorized function variants may also take a scalar argument, 1423 // e.g. linear parameters for pointers. This needs to be the scalar value 1424 // from the start of the respective part when interleaving. 
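// Illustrative example (hypothetical signature, not from the original source):
// for a variant such as
//   <4 x float> @vec_foo(<4 x float> %x, ptr %p)
// the second parameter stays scalar, so the lane-0 value of the corresponding
// operand is passed instead of a widened vector.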
1425 if (!VFTy->getParamType(I.index())->isVectorTy()) 1426 Arg = State.get(I.value(), VPLane(0)); 1427 else 1428 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 1429 Args.push_back(Arg); 1430 } 1431 1432 auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); 1433 SmallVector<OperandBundleDef, 1> OpBundles; 1434 if (CI) 1435 CI->getOperandBundlesAsDefs(OpBundles); 1436 1437 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles); 1438 applyFlags(*V); 1439 applyMetadata(*V); 1440 V->setCallingConv(Variant->getCallingConv()); 1441 1442 if (!V->getType()->isVoidTy()) 1443 State.set(this, V); 1444 } 1445 1446 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, 1447 VPCostContext &Ctx) const { 1448 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), 1449 Variant->getFunctionType()->params(), 1450 Ctx.CostKind); 1451 } 1452 1453 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1454 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 1455 VPSlotTracker &SlotTracker) const { 1456 O << Indent << "WIDEN-CALL "; 1457 1458 Function *CalledFn = getCalledScalarFunction(); 1459 if (CalledFn->getReturnType()->isVoidTy()) 1460 O << "void "; 1461 else { 1462 printAsOperand(O, SlotTracker); 1463 O << " = "; 1464 } 1465 1466 O << "call"; 1467 printFlags(O); 1468 O << " @" << CalledFn->getName() << "("; 1469 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { 1470 Op->printAsOperand(O, SlotTracker); 1471 }); 1472 O << ")"; 1473 1474 O << " (using library function"; 1475 if (Variant->hasName()) 1476 O << ": " << Variant->getName(); 1477 O << ")"; 1478 } 1479 #endif 1480 1481 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { 1482 assert(State.VF.isVector() && "not widening"); 1483 1484 SmallVector<Type *, 2> TysForDecl; 1485 // Add return type if intrinsic is overloaded on it. 1486 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI)) 1487 TysForDecl.push_back(VectorType::get(getResultType(), State.VF)); 1488 SmallVector<Value *, 4> Args; 1489 for (const auto &I : enumerate(operands())) { 1490 // Some intrinsics have a scalar argument - don't replace it with a 1491 // vector. 1492 Value *Arg; 1493 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(), 1494 State.TTI)) 1495 Arg = State.get(I.value(), VPLane(0)); 1496 else 1497 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 1498 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), 1499 State.TTI)) 1500 TysForDecl.push_back(Arg->getType()); 1501 Args.push_back(Arg); 1502 } 1503 1504 // Use vector version of the intrinsic. 1505 Module *M = State.Builder.GetInsertBlock()->getModule(); 1506 Function *VectorF = 1507 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); 1508 assert(VectorF && 1509 "Can't retrieve vector intrinsic or vector-predication intrinsics."); 1510 1511 auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); 1512 SmallVector<OperandBundleDef, 1> OpBundles; 1513 if (CI) 1514 CI->getOperandBundlesAsDefs(OpBundles); 1515 1516 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); 1517 1518 applyFlags(*V); 1519 applyMetadata(*V); 1520 1521 if (!V->getType()->isVoidTy()) 1522 State.set(this, V); 1523 } 1524 1525 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, 1526 VPCostContext &Ctx) const { 1527 // Some backends analyze intrinsic arguments to determine cost. Use the 1528 // underlying value for the operand if it has one. 
Otherwise try to use the 1529 // operand of the underlying call instruction, if there is one. Otherwise 1530 // clear Arguments. 1531 // TODO: Rework TTI interface to be independent of concrete IR values. 1532 SmallVector<const Value *> Arguments; 1533 for (const auto &[Idx, Op] : enumerate(operands())) { 1534 auto *V = Op->getUnderlyingValue(); 1535 if (!V) { 1536 if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) { 1537 Arguments.push_back(UI->getArgOperand(Idx)); 1538 continue; 1539 } 1540 Arguments.clear(); 1541 break; 1542 } 1543 Arguments.push_back(V); 1544 } 1545 1546 Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); 1547 SmallVector<Type *> ParamTys; 1548 for (unsigned I = 0; I != getNumOperands(); ++I) 1549 ParamTys.push_back( 1550 toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); 1551 1552 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. 1553 FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); 1554 IntrinsicCostAttributes CostAttrs( 1555 VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, 1556 dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()), 1557 InstructionCost::getInvalid(), &Ctx.TLI); 1558 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); 1559 } 1560 1561 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { 1562 return Intrinsic::getBaseName(VectorIntrinsicID); 1563 } 1564 1565 bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { 1566 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 1567 return all_of(enumerate(operands()), [this, &Op](const auto &X) { 1568 auto [Idx, V] = X; 1569 return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), 1570 Idx, nullptr); 1571 }); 1572 } 1573 1574 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1575 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, 1576 VPSlotTracker &SlotTracker) const { 1577 O << Indent << "WIDEN-INTRINSIC "; 1578 if (ResultTy->isVoidTy()) { 1579 O << "void "; 1580 } else { 1581 printAsOperand(O, SlotTracker); 1582 O << " = "; 1583 } 1584 1585 O << "call"; 1586 printFlags(O); 1587 O << getIntrinsicName() << "("; 1588 1589 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { 1590 Op->printAsOperand(O, SlotTracker); 1591 }); 1592 O << ")"; 1593 } 1594 #endif 1595 1596 void VPHistogramRecipe::execute(VPTransformState &State) { 1597 IRBuilderBase &Builder = State.Builder; 1598 1599 Value *Address = State.get(getOperand(0)); 1600 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true); 1601 VectorType *VTy = cast<VectorType>(Address->getType()); 1602 1603 // The histogram intrinsic requires a mask even if the recipe doesn't; 1604 // if the mask operand was omitted then all lanes should be executed and 1605 // we just need to synthesize an all-true mask. 1606 Value *Mask = nullptr; 1607 if (VPValue *VPMask = getMask()) 1608 Mask = State.get(VPMask); 1609 else 1610 Mask = 1611 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1)); 1612 1613 // If this is a subtract, we want to invert the increment amount. We may 1614 // add a separate intrinsic in future, but for now we'll try this. 
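// E.g. (illustrative): a histogram decrement by %n is lowered to the
// llvm.experimental.vector.histogram.add intrinsic with an increment of -%n.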
1615 if (Opcode == Instruction::Sub) 1616 IncAmt = Builder.CreateNeg(IncAmt); 1617 else 1618 assert(Opcode == Instruction::Add && "only add or sub supported for now"); 1619 1620 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add, 1621 {VTy, IncAmt->getType()}, 1622 {Address, IncAmt, Mask}); 1623 } 1624 1625 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, 1626 VPCostContext &Ctx) const { 1627 // FIXME: Take the gather and scatter into account as well. For now we're 1628 // generating the same cost as the fallback path, but we'll likely 1629 // need to create a new TTI method for determining the cost, including 1630 // whether we can use base + vec-of-smaller-indices or just 1631 // vec-of-pointers. 1632 assert(VF.isVector() && "Invalid VF for histogram cost"); 1633 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0)); 1634 VPValue *IncAmt = getOperand(1); 1635 Type *IncTy = Ctx.Types.inferScalarType(IncAmt); 1636 VectorType *VTy = VectorType::get(IncTy, VF); 1637 1638 // Assume that a non-constant update value (or a constant != 1) requires 1639 // a multiply, and add that into the cost. 1640 InstructionCost MulCost = 1641 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind); 1642 if (IncAmt->isLiveIn()) { 1643 ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue()); 1644 1645 if (CI && CI->getZExtValue() == 1) 1646 MulCost = TTI::TCC_Free; 1647 } 1648 1649 // Find the cost of the histogram operation itself. 1650 Type *PtrTy = VectorType::get(AddressTy, VF); 1651 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF); 1652 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, 1653 Type::getVoidTy(Ctx.LLVMCtx), 1654 {PtrTy, IncTy, MaskTy}); 1655 1656 // Add the costs together with the add/sub operation. 1657 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost + 1658 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind); 1659 } 1660 1661 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1662 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, 1663 VPSlotTracker &SlotTracker) const { 1664 O << Indent << "WIDEN-HISTOGRAM buckets: "; 1665 getOperand(0)->printAsOperand(O, SlotTracker); 1666 1667 if (Opcode == Instruction::Sub) 1668 O << ", dec: "; 1669 else { 1670 assert(Opcode == Instruction::Add); 1671 O << ", inc: "; 1672 } 1673 getOperand(1)->printAsOperand(O, SlotTracker); 1674 1675 if (VPValue *Mask = getMask()) { 1676 O << ", mask: "; 1677 Mask->printAsOperand(O, SlotTracker); 1678 } 1679 } 1680 1681 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 1682 VPSlotTracker &SlotTracker) const { 1683 O << Indent << "WIDEN-SELECT "; 1684 printAsOperand(O, SlotTracker); 1685 O << " = select "; 1686 printFlags(O); 1687 getOperand(0)->printAsOperand(O, SlotTracker); 1688 O << ", "; 1689 getOperand(1)->printAsOperand(O, SlotTracker); 1690 O << ", "; 1691 getOperand(2)->printAsOperand(O, SlotTracker); 1692 O << (isInvariantCond() ? " (condition is loop invariant)" : ""); 1693 } 1694 #endif 1695 1696 void VPWidenSelectRecipe::execute(VPTransformState &State) { 1697 // The condition can be loop invariant but still defined inside the 1698 // loop. This means that we can't just use the original 'cond' value. 1699 // We have to take the 'vectorized' value and pick the first lane. 1700 // Instcombine will make this a no-op. 1701 auto *InvarCond = 1702 isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr; 1703 1704 Value *Cond = InvarCond ? 
InvarCond : State.get(getCond()); 1705 Value *Op0 = State.get(getOperand(1)); 1706 Value *Op1 = State.get(getOperand(2)); 1707 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 1708 State.set(this, Sel); 1709 if (auto *I = dyn_cast<Instruction>(Sel)) { 1710 if (isa<FPMathOperator>(I)) 1711 applyFlags(*I); 1712 applyMetadata(*I); 1713 } 1714 } 1715 1716 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, 1717 VPCostContext &Ctx) const { 1718 SelectInst *SI = cast<SelectInst>(getUnderlyingValue()); 1719 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); 1720 Type *ScalarTy = Ctx.Types.inferScalarType(this); 1721 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1722 1723 VPValue *Op0, *Op1; 1724 using namespace llvm::VPlanPatternMatch; 1725 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && 1726 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || 1727 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { 1728 // select x, y, false --> x & y 1729 // select x, true, y --> x | y 1730 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0); 1731 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1); 1732 1733 SmallVector<const Value *, 2> Operands; 1734 if (all_of(operands(), 1735 [](VPValue *Op) { return Op->getUnderlyingValue(); })) 1736 Operands.append(SI->op_begin(), SI->op_end()); 1737 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); 1738 return Ctx.TTI.getArithmeticInstrCost( 1739 IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, 1740 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); 1741 } 1742 1743 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0)); 1744 if (!ScalarCond) 1745 CondTy = VectorType::get(CondTy, VF); 1746 1747 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 1748 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 1749 Pred = Cmp->getPredicate(); 1750 return Ctx.TTI.getCmpSelInstrCost( 1751 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind, 1752 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI); 1753 } 1754 1755 VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { 1756 AllowReassoc = FMF.allowReassoc(); 1757 NoNaNs = FMF.noNaNs(); 1758 NoInfs = FMF.noInfs(); 1759 NoSignedZeros = FMF.noSignedZeros(); 1760 AllowReciprocal = FMF.allowReciprocal(); 1761 AllowContract = FMF.allowContract(); 1762 ApproxFunc = FMF.approxFunc(); 1763 } 1764 1765 #if !defined(NDEBUG) 1766 bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { 1767 switch (OpType) { 1768 case OperationType::OverflowingBinOp: 1769 return Opcode == Instruction::Add || Opcode == Instruction::Sub || 1770 Opcode == Instruction::Mul || 1771 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart; 1772 case OperationType::Trunc: 1773 return Opcode == Instruction::Trunc; 1774 case OperationType::DisjointOp: 1775 return Opcode == Instruction::Or; 1776 case OperationType::PossiblyExactOp: 1777 return Opcode == Instruction::AShr; 1778 case OperationType::GEPOp: 1779 return Opcode == Instruction::GetElementPtr || 1780 Opcode == VPInstruction::PtrAdd; 1781 case OperationType::FPMathOp: 1782 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || 1783 Opcode == Instruction::FSub || Opcode == Instruction::FNeg || 1784 Opcode == Instruction::FDiv || Opcode == Instruction::FRem || 1785 Opcode == Instruction::FCmp || Opcode == Instruction::Select || 1786 Opcode == VPInstruction::WideIVStep || 1787 Opcode == 
VPInstruction::ReductionStartVector || 1788 Opcode == VPInstruction::ComputeReductionResult; 1789 case OperationType::NonNegOp: 1790 return Opcode == Instruction::ZExt; 1791 break; 1792 case OperationType::Cmp: 1793 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; 1794 case OperationType::Other: 1795 return true; 1796 } 1797 llvm_unreachable("Unknown OperationType enum"); 1798 } 1799 #endif 1800 1801 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1802 void VPIRFlags::printFlags(raw_ostream &O) const { 1803 switch (OpType) { 1804 case OperationType::Cmp: 1805 O << " " << CmpInst::getPredicateName(getPredicate()); 1806 break; 1807 case OperationType::DisjointOp: 1808 if (DisjointFlags.IsDisjoint) 1809 O << " disjoint"; 1810 break; 1811 case OperationType::PossiblyExactOp: 1812 if (ExactFlags.IsExact) 1813 O << " exact"; 1814 break; 1815 case OperationType::OverflowingBinOp: 1816 if (WrapFlags.HasNUW) 1817 O << " nuw"; 1818 if (WrapFlags.HasNSW) 1819 O << " nsw"; 1820 break; 1821 case OperationType::Trunc: 1822 if (TruncFlags.HasNUW) 1823 O << " nuw"; 1824 if (TruncFlags.HasNSW) 1825 O << " nsw"; 1826 break; 1827 case OperationType::FPMathOp: 1828 getFastMathFlags().print(O); 1829 break; 1830 case OperationType::GEPOp: 1831 if (GEPFlags.isInBounds()) 1832 O << " inbounds"; 1833 else if (GEPFlags.hasNoUnsignedSignedWrap()) 1834 O << " nusw"; 1835 if (GEPFlags.hasNoUnsignedWrap()) 1836 O << " nuw"; 1837 break; 1838 case OperationType::NonNegOp: 1839 if (NonNegFlags.NonNeg) 1840 O << " nneg"; 1841 break; 1842 case OperationType::Other: 1843 break; 1844 } 1845 O << " "; 1846 } 1847 #endif 1848 1849 void VPWidenRecipe::execute(VPTransformState &State) { 1850 auto &Builder = State.Builder; 1851 switch (Opcode) { 1852 case Instruction::Call: 1853 case Instruction::Br: 1854 case Instruction::PHI: 1855 case Instruction::GetElementPtr: 1856 case Instruction::Select: 1857 llvm_unreachable("This instruction is handled by a different recipe."); 1858 case Instruction::UDiv: 1859 case Instruction::SDiv: 1860 case Instruction::SRem: 1861 case Instruction::URem: 1862 case Instruction::Add: 1863 case Instruction::FAdd: 1864 case Instruction::Sub: 1865 case Instruction::FSub: 1866 case Instruction::FNeg: 1867 case Instruction::Mul: 1868 case Instruction::FMul: 1869 case Instruction::FDiv: 1870 case Instruction::FRem: 1871 case Instruction::Shl: 1872 case Instruction::LShr: 1873 case Instruction::AShr: 1874 case Instruction::And: 1875 case Instruction::Or: 1876 case Instruction::Xor: { 1877 // Just widen unops and binops. 1878 SmallVector<Value *, 2> Ops; 1879 for (VPValue *VPOp : operands()) 1880 Ops.push_back(State.get(VPOp)); 1881 1882 Value *V = Builder.CreateNAryOp(Opcode, Ops); 1883 1884 if (auto *VecOp = dyn_cast<Instruction>(V)) { 1885 applyFlags(*VecOp); 1886 applyMetadata(*VecOp); 1887 } 1888 1889 // Use this vector value for all users of the original instruction. 
    State.set(this, V);
    break;
  }
  case Instruction::ExtractValue: {
    assert(getNumOperands() == 2 && "expected single level extractvalue");
    Value *Op = State.get(getOperand(0));
    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
    State.set(this, Extract);
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0));
    Value *Freeze = Builder.CreateFreeze(Op);
    State.set(this, Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      C = Builder.CreateFCmpFMF(
          getPredicate(), A, B,
          dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    } else {
      C = Builder.CreateICmp(getPredicate(), A, B);
    }
    if (auto *I = dyn_cast<Instruction>(C))
      applyMetadata(*I);
    State.set(this, C);
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation; let the legacy cost model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. Shifts on x86 are one example.
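    // E.g. a widened 'shl' whose second operand is a uniform constant can be
    // lowered to an immediate shift on x86, which is cheaper than a shift by
    // a variable vector; the operand-value info gathered below lets TTI make
    // that distinction (an illustrative note, not an exhaustive list).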
    TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // The cost of this opcode is unknown; assume it is the same as a 'mul'.
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
                                          Ctx.CostKind);
  }
  case Instruction::ExtractValue: {
    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
                                             Ctx.CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode, VectorTy, CmpInst::makeCmpResultType(VectorTy), getPredicate(),
        Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
        {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  Value *A = State.get(Op);
  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
  State.set(this, Cast);
  if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
    applyFlags(*CastOp);
    applyMetadata(*CastOp);
  }
}

InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // TODO: In some cases, VPWidenCastRecipes are created but not considered in
  // the legacy cost model, including truncates/extends when evaluating a
  // reduction in a smaller type.
  if (!getUnderlyingValue())
    return 0;
  // Compute the CastContextHint for a recipe that may access memory.
  auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
    if (VF.isScalar())
      return TTI::CastContextHint::Normal;
    if (isa<VPInterleaveRecipe>(R))
      return TTI::CastContextHint::Interleave;
    if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
      return ReplicateRecipe->isPredicated() ?
TTI::CastContextHint::Masked 2058 : TTI::CastContextHint::Normal; 2059 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R); 2060 if (WidenMemoryRecipe == nullptr) 2061 return TTI::CastContextHint::None; 2062 if (!WidenMemoryRecipe->isConsecutive()) 2063 return TTI::CastContextHint::GatherScatter; 2064 if (WidenMemoryRecipe->isReverse()) 2065 return TTI::CastContextHint::Reversed; 2066 if (WidenMemoryRecipe->isMasked()) 2067 return TTI::CastContextHint::Masked; 2068 return TTI::CastContextHint::Normal; 2069 }; 2070 2071 VPValue *Operand = getOperand(0); 2072 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2073 // For Trunc/FPTrunc, get the context from the only user. 2074 if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && 2075 !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { 2076 if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin())) 2077 CCH = ComputeCCH(StoreRecipe); 2078 } 2079 // For Z/Sext, get the context from the operand. 2080 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 2081 Opcode == Instruction::FPExt) { 2082 if (Operand->isLiveIn()) 2083 CCH = TTI::CastContextHint::Normal; 2084 else if (Operand->getDefiningRecipe()) 2085 CCH = ComputeCCH(Operand->getDefiningRecipe()); 2086 } 2087 2088 auto *SrcTy = 2089 cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); 2090 auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF)); 2091 // Arm TTI will use the underlying instruction to determine the cost. 2092 return Ctx.TTI.getCastInstrCost( 2093 Opcode, DestTy, SrcTy, CCH, Ctx.CostKind, 2094 dyn_cast_if_present<Instruction>(getUnderlyingValue())); 2095 } 2096 2097 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2098 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 2099 VPSlotTracker &SlotTracker) const { 2100 O << Indent << "WIDEN-CAST "; 2101 printAsOperand(O, SlotTracker); 2102 O << " = " << Instruction::getOpcodeName(Opcode); 2103 printFlags(O); 2104 printOperands(O, SlotTracker); 2105 O << " to " << *getResultType(); 2106 } 2107 #endif 2108 2109 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, 2110 VPCostContext &Ctx) const { 2111 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 2112 } 2113 2114 /// A helper function that returns an integer or floating-point constant with 2115 /// value C. 2116 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 2117 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 2118 : ConstantFP::get(Ty, C); 2119 } 2120 2121 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2122 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 2123 VPSlotTracker &SlotTracker) const { 2124 O << Indent; 2125 printAsOperand(O, SlotTracker); 2126 O << " = WIDEN-INDUCTION "; 2127 printOperands(O, SlotTracker); 2128 2129 if (auto *TI = getTruncInst()) 2130 O << " (truncated to " << *TI->getType() << ")"; 2131 } 2132 #endif 2133 2134 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 2135 // The step may be defined by a recipe in the preheader (e.g. if it requires 2136 // SCEV expansion), but for the canonical induction the step is required to be 2137 // 1, which is represented as live-in. 
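  // In other words, the widened induction is canonical only if it starts at 0,
  // steps by 1, and has the same scalar type as the canonical IV (a short
  // restatement of the checks below).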
2138 if (getStepValue()->getDefiningRecipe()) 2139 return false; 2140 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 2141 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 2142 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 2143 return StartC && StartC->isZero() && StepC && StepC->isOne() && 2144 getScalarType() == CanIV->getScalarType(); 2145 } 2146 2147 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2148 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 2149 VPSlotTracker &SlotTracker) const { 2150 O << Indent; 2151 printAsOperand(O, SlotTracker); 2152 O << " = DERIVED-IV "; 2153 getStartValue()->printAsOperand(O, SlotTracker); 2154 O << " + "; 2155 getOperand(1)->printAsOperand(O, SlotTracker); 2156 O << " * "; 2157 getStepValue()->printAsOperand(O, SlotTracker); 2158 } 2159 #endif 2160 2161 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 2162 // Fast-math-flags propagate from the original induction instruction. 2163 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 2164 if (hasFastMathFlags()) 2165 State.Builder.setFastMathFlags(getFastMathFlags()); 2166 2167 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2168 /// variable on which to base the steps, \p Step is the size of the step. 2169 2170 Value *BaseIV = State.get(getOperand(0), VPLane(0)); 2171 Value *Step = State.get(getStepValue(), VPLane(0)); 2172 IRBuilderBase &Builder = State.Builder; 2173 2174 // Ensure step has the same type as that of scalar IV. 2175 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 2176 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 2177 2178 // We build scalar steps for both integer and floating-point induction 2179 // variables. Here, we determine the kind of arithmetic we will perform. 2180 Instruction::BinaryOps AddOp; 2181 Instruction::BinaryOps MulOp; 2182 if (BaseIVTy->isIntegerTy()) { 2183 AddOp = Instruction::Add; 2184 MulOp = Instruction::Mul; 2185 } else { 2186 AddOp = InductionOpcode; 2187 MulOp = Instruction::FMul; 2188 } 2189 2190 // Determine the number of scalars we need to generate for each unroll 2191 // iteration. 2192 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 2193 // Compute the scalar steps and save the results in State. 2194 Type *IntStepTy = 2195 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 2196 Type *VecIVTy = nullptr; 2197 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2198 if (!FirstLaneOnly && State.VF.isScalable()) { 2199 VecIVTy = VectorType::get(BaseIVTy, State.VF); 2200 UnitStepVec = 2201 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2202 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2203 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 2204 } 2205 2206 unsigned StartLane = 0; 2207 unsigned EndLane = FirstLaneOnly ? 
1 : State.VF.getKnownMinValue(); 2208 if (State.Lane) { 2209 StartLane = State.Lane->getKnownLane(); 2210 EndLane = StartLane + 1; 2211 } 2212 Value *StartIdx0; 2213 if (getUnrollPart(*this) == 0) 2214 StartIdx0 = ConstantInt::get(IntStepTy, 0); 2215 else { 2216 StartIdx0 = State.get(getOperand(2), true); 2217 if (getUnrollPart(*this) != 1) { 2218 StartIdx0 = 2219 Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(), 2220 getUnrollPart(*this))); 2221 } 2222 StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy); 2223 } 2224 2225 if (!FirstLaneOnly && State.VF.isScalable()) { 2226 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2227 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2228 if (BaseIVTy->isFloatingPointTy()) 2229 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2230 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2231 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2232 State.set(this, Add); 2233 // It's useful to record the lane values too for the known minimum number 2234 // of elements so we do those below. This improves the code quality when 2235 // trying to extract the first element, for example. 2236 } 2237 2238 if (BaseIVTy->isFloatingPointTy()) 2239 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 2240 2241 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 2242 Value *StartIdx = Builder.CreateBinOp( 2243 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 2244 // The step returned by `createStepForVF` is a runtime-evaluated value 2245 // when VF is scalable. Otherwise, it should be folded into a Constant. 2246 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2247 "Expected StartIdx to be folded to a constant when VF is not " 2248 "scalable"); 2249 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2250 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 2251 State.set(this, Add, VPLane(Lane)); 2252 } 2253 } 2254 2255 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2256 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 2257 VPSlotTracker &SlotTracker) const { 2258 O << Indent; 2259 printAsOperand(O, SlotTracker); 2260 O << " = SCALAR-STEPS "; 2261 printOperands(O, SlotTracker); 2262 } 2263 #endif 2264 2265 void VPWidenGEPRecipe::execute(VPTransformState &State) { 2266 assert(State.VF.isVector() && "not widening"); 2267 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 2268 // Construct a vector GEP by widening the operands of the scalar GEP as 2269 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 2270 // results in a vector of pointers when at least one operand of the GEP 2271 // is vector-typed. Thus, to keep the representation compact, we only use 2272 // vector-typed operands for loop-varying values. 2273 2274 if (areAllOperandsInvariant()) { 2275 // If we are vectorizing, but the GEP has only loop-invariant operands, 2276 // the GEP we build (by only using vector-typed operands for 2277 // loop-varying values) would be a scalar pointer. Thus, to ensure we 2278 // produce a vector of pointers, we need to either arbitrarily pick an 2279 // operand to broadcast, or broadcast a clone of the original GEP. 2280 // Here, we broadcast a clone of the original. 2281 // 2282 // TODO: If at some point we decide to scalarize instructions having 2283 // loop-invariant operands, this special case will no longer be 2284 // required. 
We would add the scalarization decision to 2285 // collectLoopScalars() and teach getVectorValue() to broadcast 2286 // the lane-zero scalar value. 2287 SmallVector<Value *> Ops; 2288 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 2289 Ops.push_back(State.get(getOperand(I), VPLane(0))); 2290 2291 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 2292 ArrayRef(Ops).drop_front(), "", 2293 getGEPNoWrapFlags()); 2294 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); 2295 State.set(this, Splat); 2296 } else { 2297 // If the GEP has at least one loop-varying operand, we are sure to 2298 // produce a vector of pointers unless VF is scalar. 2299 // The pointer operand of the new GEP. If it's loop-invariant, we 2300 // won't broadcast it. 2301 auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0)) 2302 : State.get(getOperand(0)); 2303 2304 // Collect all the indices for the new GEP. If any index is 2305 // loop-invariant, we won't broadcast it. 2306 SmallVector<Value *, 4> Indices; 2307 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 2308 VPValue *Operand = getOperand(I); 2309 if (isIndexLoopInvariant(I - 1)) 2310 Indices.push_back(State.get(Operand, VPLane(0))); 2311 else 2312 Indices.push_back(State.get(Operand)); 2313 } 2314 2315 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 2316 // but it should be a vector, otherwise. 2317 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 2318 Indices, "", getGEPNoWrapFlags()); 2319 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 2320 "NewGEP is not a pointer vector"); 2321 State.set(this, NewGEP); 2322 } 2323 } 2324 2325 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2326 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 2327 VPSlotTracker &SlotTracker) const { 2328 O << Indent << "WIDEN-GEP "; 2329 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 2330 for (size_t I = 0; I < getNumOperands() - 1; ++I) 2331 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 2332 2333 O << " "; 2334 printAsOperand(O, SlotTracker); 2335 O << " = getelementptr"; 2336 printFlags(O); 2337 printOperands(O, SlotTracker); 2338 } 2339 #endif 2340 2341 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, 2342 unsigned CurrentPart, IRBuilderBase &Builder) { 2343 // Use i32 for the gep index type when the value is constant, 2344 // or query DataLayout for a more suitable index type otherwise. 2345 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); 2346 return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0)) 2347 ? DL.getIndexType(Builder.getPtrTy(0)) 2348 : Builder.getInt32Ty(); 2349 } 2350 2351 void VPVectorEndPointerRecipe::execute(VPTransformState &State) { 2352 auto &Builder = State.Builder; 2353 unsigned CurrentPart = getUnrollPart(*this); 2354 bool IsUnitStride = Stride == 1 || Stride == -1; 2355 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true, 2356 IsUnitStride, CurrentPart, Builder); 2357 2358 // The wide store needs to start at the last vector element. 
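  // As a sketch of the address computation below, for unroll part P and a
  // runtime VF (assuming the common Stride == -1 case):
  //   NumElt   = -1 * P * RunTimeVF
  //   LastLane = -1 * (RunTimeVF - 1)
  //   Result   = Ptr + NumElt + LastLane
  // i.e. the address of the lowest element accessed by this part.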
2359 Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); 2360 if (IndexTy != RunTimeVF->getType()) 2361 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); 2362 // NumElt = Stride * CurrentPart * RunTimeVF 2363 Value *NumElt = Builder.CreateMul( 2364 ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF); 2365 // LastLane = Stride * (RunTimeVF - 1) 2366 Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1)); 2367 if (Stride != 1) 2368 LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane); 2369 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2370 Value *ResultPtr = 2371 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); 2372 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", 2373 getGEPNoWrapFlags()); 2374 2375 State.set(this, ResultPtr, /*IsScalar*/ true); 2376 } 2377 2378 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2379 void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2380 VPSlotTracker &SlotTracker) const { 2381 O << Indent; 2382 printAsOperand(O, SlotTracker); 2383 O << " = vector-end-pointer"; 2384 printFlags(O); 2385 printOperands(O, SlotTracker); 2386 } 2387 #endif 2388 2389 void VPVectorPointerRecipe::execute(VPTransformState &State) { 2390 auto &Builder = State.Builder; 2391 unsigned CurrentPart = getUnrollPart(*this); 2392 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, 2393 /*IsUnitStride*/ true, CurrentPart, Builder); 2394 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2395 2396 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); 2397 Value *ResultPtr = 2398 Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); 2399 2400 State.set(this, ResultPtr, /*IsScalar*/ true); 2401 } 2402 2403 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2404 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2405 VPSlotTracker &SlotTracker) const { 2406 O << Indent; 2407 printAsOperand(O, SlotTracker); 2408 O << " = vector-pointer "; 2409 2410 printOperands(O, SlotTracker); 2411 } 2412 #endif 2413 2414 void VPBlendRecipe::execute(VPTransformState &State) { 2415 assert(isNormalized() && "Expected blend to be normalized!"); 2416 // We know that all PHIs in non-header blocks are converted into 2417 // selects, so we don't have to worry about the insertion order and we 2418 // can just use the builder. 2419 // At this point we generate the predication tree. There may be 2420 // duplications since this is a simple recursive scan, but future 2421 // optimizations will clean it up. 2422 2423 unsigned NumIncoming = getNumIncomingValues(); 2424 2425 // Generate a sequence of selects of the form: 2426 // SELECT(Mask3, In3, 2427 // SELECT(Mask2, In2, 2428 // SELECT(Mask1, In1, 2429 // In0))) 2430 // Note that Mask0 is never used: lanes for which no path reaches this phi and 2431 // are essentially undef are taken from In0. 2432 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 2433 Value *Result = nullptr; 2434 for (unsigned In = 0; In < NumIncoming; ++In) { 2435 // We might have single edge PHIs (blocks) - use an identity 2436 // 'select' for the first PHI operand. 2437 Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); 2438 if (In == 0) 2439 Result = In0; // Initialize with the first incoming value. 2440 else { 2441 // Select between the current value and the previous incoming edge 2442 // based on the incoming mask. 
2443 Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); 2444 Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); 2445 } 2446 } 2447 State.set(this, Result, OnlyFirstLaneUsed); 2448 } 2449 2450 InstructionCost VPBlendRecipe::computeCost(ElementCount VF, 2451 VPCostContext &Ctx) const { 2452 // Handle cases where only the first lane is used the same way as the legacy 2453 // cost model. 2454 if (vputils::onlyFirstLaneUsed(this)) 2455 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 2456 2457 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 2458 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); 2459 return (getNumIncomingValues() - 1) * 2460 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, 2461 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); 2462 } 2463 2464 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2465 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 2466 VPSlotTracker &SlotTracker) const { 2467 O << Indent << "BLEND "; 2468 printAsOperand(O, SlotTracker); 2469 O << " ="; 2470 if (getNumIncomingValues() == 1) { 2471 // Not a User of any mask: not really blending, this is a 2472 // single-predecessor phi. 2473 O << " "; 2474 getIncomingValue(0)->printAsOperand(O, SlotTracker); 2475 } else { 2476 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 2477 O << " "; 2478 getIncomingValue(I)->printAsOperand(O, SlotTracker); 2479 if (I == 0) 2480 continue; 2481 O << "/"; 2482 getMask(I)->printAsOperand(O, SlotTracker); 2483 } 2484 } 2485 } 2486 #endif 2487 2488 void VPReductionRecipe::execute(VPTransformState &State) { 2489 assert(!State.Lane && "Reduction being replicated."); 2490 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2491 RecurKind Kind = getRecurrenceKind(); 2492 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 2493 "In-loop AnyOf reductions aren't currently supported"); 2494 // Propagate the fast-math flags carried by the underlying instruction. 2495 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 2496 State.Builder.setFastMathFlags(getFastMathFlags()); 2497 Value *NewVecOp = State.get(getVecOp()); 2498 if (VPValue *Cond = getCondOp()) { 2499 Value *NewCond = State.get(Cond, State.VF.isScalar()); 2500 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 2501 Type *ElementTy = VecTy ? 
VecTy->getElementType() : NewVecOp->getType(); 2502 2503 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags()); 2504 if (State.VF.isVector()) 2505 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); 2506 2507 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 2508 NewVecOp = Select; 2509 } 2510 Value *NewRed; 2511 Value *NextInChain; 2512 if (IsOrdered) { 2513 if (State.VF.isVector()) 2514 NewRed = 2515 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain); 2516 else 2517 NewRed = State.Builder.CreateBinOp( 2518 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), 2519 PrevInChain, NewVecOp); 2520 PrevInChain = NewRed; 2521 NextInChain = NewRed; 2522 } else { 2523 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2524 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind); 2525 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2526 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain); 2527 else 2528 NextInChain = State.Builder.CreateBinOp( 2529 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed, 2530 PrevInChain); 2531 } 2532 State.set(this, NextInChain, /*IsScalar*/ true); 2533 } 2534 2535 void VPReductionEVLRecipe::execute(VPTransformState &State) { 2536 assert(!State.Lane && "Reduction being replicated."); 2537 2538 auto &Builder = State.Builder; 2539 // Propagate the fast-math flags carried by the underlying instruction. 2540 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 2541 Builder.setFastMathFlags(getFastMathFlags()); 2542 2543 RecurKind Kind = getRecurrenceKind(); 2544 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); 2545 Value *VecOp = State.get(getVecOp()); 2546 Value *EVL = State.get(getEVL(), VPLane(0)); 2547 2548 Value *Mask; 2549 if (VPValue *CondOp = getCondOp()) 2550 Mask = State.get(CondOp); 2551 else 2552 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2553 2554 Value *NewRed; 2555 if (isOrdered()) { 2556 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL); 2557 } else { 2558 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL); 2559 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2560 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 2561 else 2562 NewRed = Builder.CreateBinOp( 2563 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed, 2564 Prev); 2565 } 2566 State.set(this, NewRed, /*IsScalar*/ true); 2567 } 2568 2569 InstructionCost VPReductionRecipe::computeCost(ElementCount VF, 2570 VPCostContext &Ctx) const { 2571 RecurKind RdxKind = getRecurrenceKind(); 2572 Type *ElementTy = Ctx.Types.inferScalarType(this); 2573 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF)); 2574 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind); 2575 FastMathFlags FMFs = getFastMathFlags(); 2576 std::optional<FastMathFlags> OptionalFMF = 2577 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; 2578 2579 // TODO: Support any-of reductions. 2580 assert( 2581 (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || 2582 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2583 "Any-of reduction not implemented in VPlan-based cost model currently."); 2584 2585 // Note that TTI should model the cost of moving result to the scalar register 2586 // and the BinOp cost in the getMinMaxReductionCost(). 
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
    return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
  }

  // Note that TTI should model the cost of moving the result to the scalar
  // register and the BinOp cost in the getArithmeticReductionCost().
  return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
                                            Ctx.CostKind);
}

VPExpressionRecipe::VPExpressionRecipe(
    ExpressionTypes ExpressionType,
    ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
    : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
      ExpressionRecipes(SetVector<VPSingleDefRecipe *>(
                            ExpressionRecipes.begin(), ExpressionRecipes.end())
                            .takeVector()),
      ExpressionType(ExpressionType) {
  assert(!ExpressionRecipes.empty() && "Nothing to combine?");
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");

  // Maintain a copy of the expression recipes as a set of users.
  SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
  for (auto *R : ExpressionRecipes)
    ExpressionRecipesAsSetOfUsers.insert(R);

  // Recipes in the expression, except the last one, must only be used by
  // (other) recipes inside the expression. If there are other users, external
  // to the expression, use a clone of the recipe for external users.
  for (VPSingleDefRecipe *R : ExpressionRecipes) {
    if (R != ExpressionRecipes.back() &&
        any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
          return !ExpressionRecipesAsSetOfUsers.contains(U);
        })) {
      // There are users outside of the expression. Clone the recipe and use
      // the clone for those external users.
      VPSingleDefRecipe *CopyForExtUsers = R->clone();
      R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
                                                VPUser &U, unsigned) {
        return !ExpressionRecipesAsSetOfUsers.contains(&U);
      });
      CopyForExtUsers->insertBefore(R);
    }
    if (R->getParent())
      R->removeFromParent();
  }

  // Internalize all external operands to the expression recipes. To do so,
  // create new temporary VPValues for all operands defined by a recipe outside
  // the expression. The original operands are added as operands of the
  // VPExpressionRecipe itself.
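  // For illustration (a hypothetical expression {Ext, Red} where Ext reads a
  // value %a defined outside the expression): %a becomes an operand of the
  // VPExpressionRecipe itself, and Ext is rewired to a placeholder VPValue
  // that is only mapped back to %a if the expression is later decomposed.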
2642 for (auto *R : ExpressionRecipes) { 2643 for (const auto &[Idx, Op] : enumerate(R->operands())) { 2644 auto *Def = Op->getDefiningRecipe(); 2645 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def)) 2646 continue; 2647 addOperand(Op); 2648 LiveInPlaceholders.push_back(new VPValue()); 2649 R->setOperand(Idx, LiveInPlaceholders.back()); 2650 } 2651 } 2652 } 2653 2654 void VPExpressionRecipe::decompose() { 2655 for (auto *R : ExpressionRecipes) 2656 R->insertBefore(this); 2657 2658 for (const auto &[Idx, Op] : enumerate(operands())) 2659 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op); 2660 2661 replaceAllUsesWith(ExpressionRecipes.back()); 2662 ExpressionRecipes.clear(); 2663 } 2664 2665 InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, 2666 VPCostContext &Ctx) const { 2667 Type *RedTy = Ctx.Types.inferScalarType(this); 2668 auto *SrcVecTy = cast<VectorType>( 2669 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)); 2670 assert(RedTy->isIntegerTy() && 2671 "VPExpressionRecipe only supports integer types currently."); 2672 switch (ExpressionType) { 2673 case ExpressionTypes::ExtendedReduction: { 2674 unsigned Opcode = RecurrenceDescriptor::getOpcode( 2675 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind()); 2676 return Ctx.TTI.getExtendedReductionCost( 2677 Opcode, 2678 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == 2679 Instruction::ZExt, 2680 RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); 2681 } 2682 case ExpressionTypes::MulAccReduction: 2683 return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind); 2684 2685 case ExpressionTypes::ExtMulAccReduction: 2686 return Ctx.TTI.getMulAccReductionCost( 2687 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == 2688 Instruction::ZExt, 2689 RedTy, SrcVecTy, Ctx.CostKind); 2690 } 2691 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); 2692 } 2693 2694 bool VPExpressionRecipe::mayReadOrWriteMemory() const { 2695 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) { 2696 return R->mayReadFromMemory() || R->mayWriteToMemory(); 2697 }); 2698 } 2699 2700 bool VPExpressionRecipe::mayHaveSideEffects() const { 2701 assert( 2702 none_of(ExpressionRecipes, 2703 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && 2704 "expression cannot contain recipes with side-effects"); 2705 return false; 2706 } 2707 2708 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2709 2710 void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, 2711 VPSlotTracker &SlotTracker) const { 2712 O << Indent << "EXPRESSION "; 2713 printAsOperand(O, SlotTracker); 2714 O << " = "; 2715 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); 2716 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); 2717 2718 switch (ExpressionType) { 2719 case ExpressionTypes::ExtendedReduction: { 2720 getOperand(1)->printAsOperand(O, SlotTracker); 2721 O << " +"; 2722 O << " reduce." 
<< Instruction::getOpcodeName(Opcode) << " ("; 2723 getOperand(0)->printAsOperand(O, SlotTracker); 2724 Red->printFlags(O); 2725 2726 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); 2727 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " 2728 << *Ext0->getResultType(); 2729 if (Red->isConditional()) { 2730 O << ", "; 2731 Red->getCondOp()->printAsOperand(O, SlotTracker); 2732 } 2733 O << ")"; 2734 break; 2735 } 2736 case ExpressionTypes::MulAccReduction: 2737 case ExpressionTypes::ExtMulAccReduction: { 2738 getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); 2739 O << " + "; 2740 O << "reduce." 2741 << Instruction::getOpcodeName( 2742 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) 2743 << " ("; 2744 O << "mul"; 2745 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; 2746 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2] 2747 : ExpressionRecipes[0]); 2748 Mul->printFlags(O); 2749 if (IsExtended) 2750 O << "("; 2751 getOperand(0)->printAsOperand(O, SlotTracker); 2752 if (IsExtended) { 2753 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); 2754 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " 2755 << *Ext0->getResultType() << "), ("; 2756 } else { 2757 O << ", "; 2758 } 2759 getOperand(1)->printAsOperand(O, SlotTracker); 2760 if (IsExtended) { 2761 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); 2762 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " 2763 << *Ext1->getResultType() << ")"; 2764 } 2765 if (Red->isConditional()) { 2766 O << ", "; 2767 Red->getCondOp()->printAsOperand(O, SlotTracker); 2768 } 2769 O << ")"; 2770 break; 2771 } 2772 } 2773 } 2774 2775 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 2776 VPSlotTracker &SlotTracker) const { 2777 O << Indent << "REDUCE "; 2778 printAsOperand(O, SlotTracker); 2779 O << " = "; 2780 getChainOp()->printAsOperand(O, SlotTracker); 2781 O << " +"; 2782 printFlags(O); 2783 O << " reduce." 2784 << Instruction::getOpcodeName( 2785 RecurrenceDescriptor::getOpcode(getRecurrenceKind())) 2786 << " ("; 2787 getVecOp()->printAsOperand(O, SlotTracker); 2788 if (isConditional()) { 2789 O << ", "; 2790 getCondOp()->printAsOperand(O, SlotTracker); 2791 } 2792 O << ")"; 2793 } 2794 2795 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2796 VPSlotTracker &SlotTracker) const { 2797 O << Indent << "REDUCE "; 2798 printAsOperand(O, SlotTracker); 2799 O << " = "; 2800 getChainOp()->printAsOperand(O, SlotTracker); 2801 O << " +"; 2802 printFlags(O); 2803 O << " vp.reduce." 2804 << Instruction::getOpcodeName( 2805 RecurrenceDescriptor::getOpcode(getRecurrenceKind())) 2806 << " ("; 2807 getVecOp()->printAsOperand(O, SlotTracker); 2808 O << ", "; 2809 getEVL()->printAsOperand(O, SlotTracker); 2810 if (isConditional()) { 2811 O << ", "; 2812 getCondOp()->printAsOperand(O, SlotTracker); 2813 } 2814 O << ")"; 2815 } 2816 2817 #endif 2818 2819 /// A helper function to scalarize a single Instruction in the innermost loop. 2820 /// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue 2821 /// operands from \p RepRecipe instead of \p Instr's operands. 
static void scalarizeInstruction(const Instruction *Instr,
                                 VPReplicateRecipe *RepRecipe,
                                 const VPLane &Lane, VPTransformState &State) {
  assert((!Instr->getType()->isAggregateType() ||
          canVectorizeTy(Instr->getType())) &&
         "Expected vectorizable or non-aggregate type.");

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned");
#if !defined(NDEBUG)
    // Verify that VPlan type inference results agree with the type of the
    // generated values.
    assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
           "inferred type and type from generated instructions do not match");
#endif
  }

  RepRecipe->applyFlags(*Cloned);
  RepRecipe->applyMetadata(*Cloned);

  if (auto DL = RepRecipe->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(RepRecipe->operands())) {
    auto InputLane = Lane;
    VPValue *Operand = I.value();
    if (vputils::isSingleScalar(Operand))
      InputLane = VPLane::getFirstLane();
    Cloned->setOperand(I.index(), State.get(Operand, InputLane));
  }

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(Cloned);

  State.set(RepRecipe, Cloned, Lane);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    State.AC->registerAssumption(II);

  assert(
      (RepRecipe->getParent()->getParent() ||
       !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected the recipe to be in a region or to have all of its operands "
      "defined outside the vectorized region.");
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();

  if (!State.Lane) {
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled");
    scalarizeInstruction(UI, this, VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated");
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
  scalarizeInstruction(UI, this, *State.Lane, State);
  // Insert the scalar instance, packing it into a vector.
  if (State.VF.isVector() && shouldPack()) {
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(VectorType::get(UI->getType(), State.VF))
            : State.get(this);
    State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
                                                        *State.Lane));
  }
}

bool VPReplicateRecipe::shouldPack() const {
  // Determine whether the recipe is used by a widened recipe via an
  // intervening VPPredInstPHIRecipe. In this case, also pack the scalar values
  // in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}

InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  Instruction *UI = cast<Instruction>(getUnderlyingValue());
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing its cost multiple times for now.
  Ctx.SkipCostComputation.insert(UI);

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  Type *ResultTy = Ctx.Types.inferScalarType(this);
  switch (UI->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto Op2Info = Ctx.getOperandInfo(getOperand(1));
    SmallVector<const Value *, 4> Operands(UI->operand_values());
    return Ctx.TTI.getArithmeticInstrCost(
               UI->getOpcode(), ResultTy, CostKind,
               {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
               Op2Info, Operands, UI, &Ctx.TLI) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  }
  }

  return Ctx.getLegacyCost(UI, VF);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Branch on Mask works only on single instance.");

  VPValue *BlockInMask = getOperand(0);
  Value *ConditionBit = State.get(BlockInMask, *State.Lane);

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
2994 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2995 assert(isa<UnreachableInst>(CurrentTerminator) && 2996 "Expected to replace unreachable terminator with conditional branch."); 2997 auto CondBr = 2998 State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr); 2999 CondBr->setSuccessor(0, nullptr); 3000 CurrentTerminator->eraseFromParent(); 3001 } 3002 3003 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, 3004 VPCostContext &Ctx) const { 3005 // The legacy cost model doesn't assign costs to branches for individual 3006 // replicate regions. Match the current behavior in the VPlan cost model for 3007 // now. 3008 return 0; 3009 } 3010 3011 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 3012 assert(State.Lane && "Predicated instruction PHI works per instance."); 3013 Instruction *ScalarPredInst = 3014 cast<Instruction>(State.get(getOperand(0), *State.Lane)); 3015 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 3016 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 3017 assert(PredicatingBB && "Predicated block has no single predecessor."); 3018 assert(isa<VPReplicateRecipe>(getOperand(0)) && 3019 "operand must be VPReplicateRecipe"); 3020 3021 // By current pack/unpack logic we need to generate only a single phi node: if 3022 // a vector value for the predicated instruction exists at this point it means 3023 // the instruction has vector users only, and a phi for the vector value is 3024 // needed. In this case the recipe of the predicated instruction is marked to 3025 // also do that packing, thereby "hoisting" the insert-element sequence. 3026 // Otherwise, a phi node for the scalar value is needed. 3027 if (State.hasVectorValue(getOperand(0))) { 3028 Value *VectorValue = State.get(getOperand(0)); 3029 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 3030 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 3031 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 3032 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 3033 if (State.hasVectorValue(this)) 3034 State.reset(this, VPhi); 3035 else 3036 State.set(this, VPhi); 3037 // NOTE: Currently we need to update the value of the operand, so the next 3038 // predicated iteration inserts its generated value in the correct vector. 3039 State.reset(getOperand(0), VPhi); 3040 } else { 3041 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane()) 3042 return; 3043 3044 Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0)); 3045 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 3046 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 3047 PredicatingBB); 3048 Phi->addIncoming(ScalarPredInst, PredicatedBB); 3049 if (State.hasScalarValue(this, *State.Lane)) 3050 State.reset(this, Phi, *State.Lane); 3051 else 3052 State.set(this, Phi, *State.Lane); 3053 // NOTE: Currently we need to update the value of the operand, so the next 3054 // predicated iteration inserts its generated value in the correct vector. 
3055 State.reset(getOperand(0), Phi, *State.Lane); 3056 } 3057 } 3058 3059 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3060 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3061 VPSlotTracker &SlotTracker) const { 3062 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 3063 printAsOperand(O, SlotTracker); 3064 O << " = "; 3065 printOperands(O, SlotTracker); 3066 } 3067 #endif 3068 3069 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 3070 VPCostContext &Ctx) const { 3071 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 3072 const Align Alignment = getLoadStoreAlignment(&Ingredient); 3073 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr())) 3074 ->getAddressSpace(); 3075 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) 3076 ? Instruction::Load 3077 : Instruction::Store; 3078 3079 if (!Consecutive) { 3080 // TODO: Using the original IR may not be accurate. 3081 // Currently, ARM will use the underlying IR to calculate gather/scatter 3082 // instruction cost. 3083 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 3084 assert(!Reverse && 3085 "Inconsecutive memory access should not have the order."); 3086 return Ctx.TTI.getAddressComputationCost(Ty) + 3087 Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, 3088 Ctx.CostKind, &Ingredient); 3089 } 3090 3091 InstructionCost Cost = 0; 3092 if (IsMasked) { 3093 Cost += 3094 Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind); 3095 } else { 3096 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( 3097 isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0) 3098 : getOperand(1)); 3099 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind, 3100 OpInfo, &Ingredient); 3101 } 3102 if (!Reverse) 3103 return Cost; 3104 3105 return Cost += Ctx.TTI.getShuffleCost( 3106 TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty), 3107 cast<VectorType>(Ty), {}, Ctx.CostKind, 0); 3108 } 3109 3110 void VPWidenLoadRecipe::execute(VPTransformState &State) { 3111 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 3112 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 3113 const Align Alignment = getLoadStoreAlignment(&Ingredient); 3114 bool CreateGather = !isConsecutive(); 3115 3116 auto &Builder = State.Builder; 3117 Value *Mask = nullptr; 3118 if (auto *VPMask = getMask()) { 3119 // Mask reversal is only needed for non-all-one (null) masks, as reverse 3120 // of a null all-one mask is a null mask. 
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(Mask, "reverse");
  }

  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
  Value *NewLI;
  if (CreateGather) {
    NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
                                       "wide.masked.gather");
  } else if (Mask) {
    NewLI =
        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
                                 PoisonValue::get(DataTy), "wide.masked.load");
  } else {
    NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
  }
  applyMetadata(*cast<Instruction>(NewLI));
  if (Reverse)
    NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
  State.set(this, NewLI);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
#endif

/// Use an all-true mask for the reverse rather than the actual mask, as it
/// avoids a dependence without affecting the result.
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
                                     Value *EVL, const Twine &Name) {
  VectorType *ValTy = cast<VectorType>(Operand->getType());
  Value *AllTrueMask =
      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
}

void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  CallInst *NewLI;
  Value *EVL = State.get(getEVL(), VPLane(0));
  Value *Addr = State.get(getAddr(), !CreateGather);
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  }

  if (CreateGather) {
    NewLI =
        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
                                {Addr, Mask, EVL}, nullptr,
                                "wide.masked.gather");
  } else {
    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
                                    {Addr, Mask, EVL}, nullptr, "vp.op.load");
  }
  NewLI->addParamAttr(
      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
  applyMetadata(*NewLI);
  Instruction *Res = NewLI;
  if (isReverse())
    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
  State.set(this, Res);
}

InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
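  // For example, a consecutive, unmasked, non-reversed vp.load of i32 at
  // VF <vscale x 4> is costed below as a masked load of <vscale x 4 x i32>;
  // reversed accesses additionally pay an SK_Reverse shuffle.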
3211 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 3212 const Align Alignment = getLoadStoreAlignment(&Ingredient); 3213 unsigned AS = getLoadStoreAddressSpace(&Ingredient); 3214 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 3215 Instruction::Load, Ty, Alignment, AS, Ctx.CostKind); 3216 if (!Reverse) 3217 return Cost; 3218 3219 return Cost + Ctx.TTI.getShuffleCost( 3220 TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty), 3221 cast<VectorType>(Ty), {}, Ctx.CostKind, 0); 3222 } 3223 3224 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3225 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 3226 VPSlotTracker &SlotTracker) const { 3227 O << Indent << "WIDEN "; 3228 printAsOperand(O, SlotTracker); 3229 O << " = vp.load "; 3230 printOperands(O, SlotTracker); 3231 } 3232 #endif 3233 3234 void VPWidenStoreRecipe::execute(VPTransformState &State) { 3235 VPValue *StoredVPValue = getStoredValue(); 3236 bool CreateScatter = !isConsecutive(); 3237 const Align Alignment = getLoadStoreAlignment(&Ingredient); 3238 3239 auto &Builder = State.Builder; 3240 3241 Value *Mask = nullptr; 3242 if (auto *VPMask = getMask()) { 3243 // Mask reversal is only needed for non-all-one (null) masks, as reverse 3244 // of a null all-one mask is a null mask. 3245 Mask = State.get(VPMask); 3246 if (isReverse()) 3247 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 3248 } 3249 3250 Value *StoredVal = State.get(StoredVPValue); 3251 if (isReverse()) { 3252 // If we store to reverse consecutive memory locations, then we need 3253 // to reverse the order of elements in the stored value. 3254 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 3255 // We don't want to update the value in the map as it might be used in 3256 // another expression. So don't call resetVectorValue(StoredVal). 
  }
  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
  Instruction *NewSI = nullptr;
  if (CreateScatter)
    NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
  else if (Mask)
    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
  else
    NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
  applyMetadata(*NewSI);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = getStoredValue();
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(&Ingredient);

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(StoredValue);
  Value *EVL = State.get(getEVL(), VPLane(0));
  if (isReverse())
    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  }
  Value *Addr = State.get(getAddr(), !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                    Intrinsic::vp_scatter,
                                    {StoredVal, Addr, Mask, EVL});
  } else {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                    Intrinsic::vp_store,
                                    {StoredVal, Addr, Mask, EVL});
  }
  NewSI->addParamAttr(
      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
  applyMetadata(*NewSI);
}

InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  unsigned AS = getLoadStoreAddressSpace(&Ingredient);
  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
      Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
  if (!Reverse)
    return Cost;

  return Cost + Ctx.TTI.getShuffleCost(
                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy,
                                     const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
3346 auto VF = DstVTy->getElementCount(); 3347 auto *SrcVecTy = cast<VectorType>(V->getType()); 3348 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 3349 Type *SrcElemTy = SrcVecTy->getElementType(); 3350 Type *DstElemTy = DstVTy->getElementType(); 3351 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3352 "Vector elements must have same size"); 3353 3354 // Do a direct cast if element types are castable. 3355 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3356 return Builder.CreateBitOrPointerCast(V, DstVTy); 3357 } 3358 // V cannot be directly casted to desired vector type. 3359 // May happen when V is a floating point vector but DstVTy is a vector of 3360 // pointers or vice-versa. Handle this using a two-step bitcast using an 3361 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3362 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3363 "Only one type should be a pointer type"); 3364 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3365 "Only one type should be a floating point type"); 3366 Type *IntTy = 3367 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3368 auto *VecIntTy = VectorType::get(IntTy, VF); 3369 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3370 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 3371 } 3372 3373 /// Return a vector containing interleaved elements from multiple 3374 /// smaller input vectors. 3375 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 3376 const Twine &Name) { 3377 unsigned Factor = Vals.size(); 3378 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 3379 3380 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 3381 #ifndef NDEBUG 3382 for (Value *Val : Vals) 3383 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 3384 #endif 3385 3386 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 3387 // must use intrinsics to interleave. 3388 if (VecTy->isScalableTy()) { 3389 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors"); 3390 VectorType *InterleaveTy = 3391 VectorType::get(VecTy->getElementType(), 3392 VecTy->getElementCount().multiplyCoefficientBy(Factor)); 3393 return Builder.CreateIntrinsic(InterleaveTy, 3394 getInterleaveIntrinsicID(Factor), Vals, 3395 /*FMFSource=*/nullptr, Name); 3396 } 3397 3398 // Fixed length. Start by concatenating all vectors into a wide vector. 3399 Value *WideVec = concatenateVectors(Builder, Vals); 3400 3401 // Interleave the elements into the wide vector. 3402 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 3403 return Builder.CreateShuffleVector( 3404 WideVec, createInterleaveMask(NumElts, Factor), Name); 3405 } 3406 3407 // Try to vectorize the interleave group that \p Instr belongs to. 3408 // 3409 // E.g. Translate following interleaved load group (factor = 3): 3410 // for (i = 0; i < N; i+=3) { 3411 // R = Pic[i]; // Member of index 0 3412 // G = Pic[i+1]; // Member of index 1 3413 // B = Pic[i+2]; // Member of index 2 3414 // ... 
// do something to R, G, B 3415 // } 3416 // To: 3417 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 3418 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 3419 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 3420 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 3421 // 3422 // Or translate following interleaved store group (factor = 3): 3423 // for (i = 0; i < N; i+=3) { 3424 // ... do something to R, G, B 3425 // Pic[i] = R; // Member of index 0 3426 // Pic[i+1] = G; // Member of index 1 3427 // Pic[i+2] = B; // Member of index 2 3428 // } 3429 // To: 3430 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 3431 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 3432 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 3433 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 3434 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 3435 void VPInterleaveRecipe::execute(VPTransformState &State) { 3436 assert(!State.Lane && "Interleave group being replicated."); 3437 const InterleaveGroup<Instruction> *Group = IG; 3438 Instruction *Instr = Group->getInsertPos(); 3439 3440 // Prepare for the vector type of the interleaved load/store. 3441 Type *ScalarTy = getLoadStoreType(Instr); 3442 unsigned InterleaveFactor = Group->getFactor(); 3443 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 3444 3445 VPValue *BlockInMask = getMask(); 3446 VPValue *Addr = getAddr(); 3447 Value *ResAddr = State.get(Addr, VPLane(0)); 3448 Value *PoisonVec = PoisonValue::get(VecTy); 3449 3450 auto CreateGroupMask = [&BlockInMask, &State, 3451 &InterleaveFactor](Value *MaskForGaps) -> Value * { 3452 if (State.VF.isScalable()) { 3453 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 3454 assert(InterleaveFactor <= 8 && 3455 "Unsupported deinterleave factor for scalable vectors"); 3456 auto *ResBlockInMask = State.get(BlockInMask); 3457 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); 3458 return interleaveVectors(State.Builder, Ops, "interleaved.mask"); 3459 } 3460 3461 if (!BlockInMask) 3462 return MaskForGaps; 3463 3464 Value *ResBlockInMask = State.get(BlockInMask); 3465 Value *ShuffledMask = State.Builder.CreateShuffleVector( 3466 ResBlockInMask, 3467 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()), 3468 "interleaved.mask"); 3469 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 3470 ShuffledMask, MaskForGaps) 3471 : ShuffledMask; 3472 }; 3473 3474 const DataLayout &DL = Instr->getDataLayout(); 3475 // Vectorize the interleaved load group. 
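  // A single wide (possibly masked) load is emitted and then split into one
  // vector per member: via the matching vector.deinterleave intrinsic for
  // scalable VFs, or via strided shufflevector masks for fixed VFs.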
3476 if (isa<LoadInst>(Instr)) { 3477 Value *MaskForGaps = nullptr; 3478 if (NeedsMaskForGaps) { 3479 MaskForGaps = 3480 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group); 3481 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 3482 } 3483 3484 Instruction *NewLoad; 3485 if (BlockInMask || MaskForGaps) { 3486 Value *GroupMask = CreateGroupMask(MaskForGaps); 3487 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, 3488 Group->getAlign(), GroupMask, 3489 PoisonVec, "wide.masked.vec"); 3490 } else 3491 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr, 3492 Group->getAlign(), "wide.vec"); 3493 Group->addMetadata(NewLoad); 3494 3495 ArrayRef<VPValue *> VPDefs = definedValues(); 3496 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 3497 if (VecTy->isScalableTy()) { 3498 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 3499 // so must use intrinsics to deinterleave. 3500 assert(InterleaveFactor <= 8 && 3501 "Unsupported deinterleave factor for scalable vectors"); 3502 Value *Deinterleave = State.Builder.CreateIntrinsic( 3503 getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(), 3504 NewLoad, 3505 /*FMFSource=*/nullptr, "strided.vec"); 3506 3507 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { 3508 Instruction *Member = Group->getMember(I); 3509 Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I); 3510 if (!Member) { 3511 // This value is not needed as it's not used 3512 cast<Instruction>(StridedVec)->eraseFromParent(); 3513 continue; 3514 } 3515 // If this member has different type, cast the result type. 3516 if (Member->getType() != ScalarTy) { 3517 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 3518 StridedVec = 3519 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 3520 } 3521 3522 if (Group->isReverse()) 3523 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 3524 3525 State.set(VPDefs[J], StridedVec); 3526 ++J; 3527 } 3528 3529 return; 3530 } 3531 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 3532 3533 // For each member in the group, shuffle out the appropriate data from the 3534 // wide loads. 3535 unsigned J = 0; 3536 for (unsigned I = 0; I < InterleaveFactor; ++I) { 3537 Instruction *Member = Group->getMember(I); 3538 3539 // Skip the gaps in the group. 3540 if (!Member) 3541 continue; 3542 3543 auto StrideMask = 3544 createStrideMask(I, InterleaveFactor, State.VF.getFixedValue()); 3545 Value *StridedVec = 3546 State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec"); 3547 3548 // If this member has different type, cast the result type. 3549 if (Member->getType() != ScalarTy) { 3550 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 3551 StridedVec = 3552 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 3553 } 3554 3555 if (Group->isReverse()) 3556 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 3557 3558 State.set(VPDefs[J], StridedVec); 3559 ++J; 3560 } 3561 return; 3562 } 3563 3564 // The sub vector type for current instruction. 3565 auto *SubVT = VectorType::get(ScalarTy, State.VF); 3566 3567 // Vectorize the interleaved store group. 
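  // Each member's vector is collected (poison for gaps), interleaved into one
  // wide vector, and written back with a single store that is masked when a
  // block-in mask or a gap mask is required.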
  Value *MaskForGaps =
      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || !State.VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Failed to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(i);

    // Skip the gaps in the group.
    if (!Member) {
      Value *Undef = PoisonValue::get(SubVT);
      StoredVecs.push_back(Undef);
      continue;
    }

    Value *StoredVec = State.get(StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");

    // If this member has a different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

    StoredVecs.push_back(StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        IVec, ResAddr, Group->getAlign(), GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());

  Group->addMetadata(NewStoreInstr);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << " store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << " ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Instruction *InsertPos = getInsertPos();
  // Find the VPValue index of the interleave group. We need to skip gaps.
  unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
    if (auto *Member = IG->getMember(Idx)) {
      if (Member == InsertPos)
        break;
      InsertPosIdx++;
    }
  Type *ValTy = Ctx.Types.inferScalarType(
      getNumDefinedValues() > 0 ?
getVPValue(InsertPosIdx) 3660 : getStoredValues()[InsertPosIdx]); 3661 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 3662 unsigned AS = getLoadStoreAddressSpace(InsertPos); 3663 3664 unsigned InterleaveFactor = IG->getFactor(); 3665 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 3666 3667 // Holds the indices of existing members in the interleaved group. 3668 SmallVector<unsigned, 4> Indices; 3669 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 3670 if (IG->getMember(IF)) 3671 Indices.push_back(IF); 3672 3673 // Calculate the cost of the whole interleaved group. 3674 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( 3675 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices, 3676 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps); 3677 3678 if (!IG->isReverse()) 3679 return Cost; 3680 3681 return Cost + IG->getNumMembers() * 3682 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 3683 VectorTy, VectorTy, {}, Ctx.CostKind, 3684 0); 3685 } 3686 3687 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3688 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3689 VPSlotTracker &SlotTracker) const { 3690 O << Indent << "EMIT "; 3691 printAsOperand(O, SlotTracker); 3692 O << " = CANONICAL-INDUCTION "; 3693 printOperands(O, SlotTracker); 3694 } 3695 #endif 3696 3697 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 3698 return IsScalarAfterVectorization && 3699 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 3700 } 3701 3702 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 3703 assert(getInductionDescriptor().getKind() == 3704 InductionDescriptor::IK_PtrInduction && 3705 "Not a pointer induction according to InductionDescriptor!"); 3706 assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() && 3707 "Unexpected type."); 3708 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 3709 "Recipe should have been replaced"); 3710 3711 unsigned CurrentPart = getUnrollPart(*this); 3712 3713 // Build a pointer phi 3714 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 3715 Type *ScStValueType = ScalarStartValue->getType(); 3716 3717 BasicBlock *VectorPH = 3718 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); 3719 PHINode *NewPointerPhi = nullptr; 3720 if (CurrentPart == 0) { 3721 IRBuilder<>::InsertPointGuard Guard(State.Builder); 3722 if (State.Builder.GetInsertPoint() != 3723 State.Builder.GetInsertBlock()->getFirstNonPHIIt()) 3724 State.Builder.SetInsertPoint( 3725 State.Builder.GetInsertBlock()->getFirstNonPHIIt()); 3726 NewPointerPhi = State.Builder.CreatePHI(ScStValueType, 2, "pointer.phi"); 3727 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 3728 NewPointerPhi->setDebugLoc(getDebugLoc()); 3729 } else { 3730 // The recipe has been unrolled. In that case, fetch the single pointer phi 3731 // shared among all unrolled parts of the recipe. 3732 auto *GEP = 3733 cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand())); 3734 NewPointerPhi = cast<PHINode>(GEP->getPointerOperand()); 3735 } 3736 3737 // A pointer induction, performed by using a gep 3738 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 3739 Value *ScalarStepValue = State.get(getStepValue(), VPLane(0)); 3740 Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue()); 3741 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 3742 // Add induction update using an incorrect block temporarily. 
The phi node 3743 // will be fixed after VPlan execution. Note that at this point the latch 3744 // block cannot be used, as it does not exist yet. 3745 // TODO: Model increment value in VPlan, by turning the recipe into a 3746 // multi-def and a subclass of VPHeaderPHIRecipe. 3747 if (CurrentPart == 0) { 3748 // The recipe represents the first part of the pointer induction. Create the 3749 // GEP to increment the phi across all unrolled parts. 3750 Value *NumUnrolledElems = State.get(getOperand(2), true); 3751 3752 Value *InductionGEP = GetElementPtrInst::Create( 3753 State.Builder.getInt8Ty(), NewPointerPhi, 3754 State.Builder.CreateMul( 3755 ScalarStepValue, 3756 State.Builder.CreateTrunc(NumUnrolledElems, PhiType)), 3757 "ptr.ind", InductionLoc); 3758 3759 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 3760 } 3761 3762 // Create actual address geps that use the pointer phi as base and a 3763 // vectorized version of the step value (<step*0, ..., step*N>) as offset. 3764 Type *VecPhiType = VectorType::get(PhiType, State.VF); 3765 Value *StartOffsetScalar = State.Builder.CreateMul( 3766 RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); 3767 Value *StartOffset = 3768 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 3769 // Create a vector of consecutive numbers from zero to VF. 3770 StartOffset = State.Builder.CreateAdd( 3771 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 3772 3773 assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) && 3774 "scalar step must be the same across all parts"); 3775 Value *GEP = State.Builder.CreateGEP( 3776 State.Builder.getInt8Ty(), NewPointerPhi, 3777 State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( 3778 State.VF, ScalarStepValue)), 3779 "vector.gep"); 3780 State.set(this, GEP); 3781 } 3782 3783 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3784 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 3785 VPSlotTracker &SlotTracker) const { 3786 assert((getNumOperands() == 3 || getNumOperands() == 5) && 3787 "unexpected number of operands"); 3788 O << Indent << "EMIT "; 3789 printAsOperand(O, SlotTracker); 3790 O << " = WIDEN-POINTER-INDUCTION "; 3791 getStartValue()->printAsOperand(O, SlotTracker); 3792 O << ", "; 3793 getStepValue()->printAsOperand(O, SlotTracker); 3794 O << ", "; 3795 getOperand(2)->printAsOperand(O, SlotTracker); 3796 if (getNumOperands() == 5) { 3797 O << ", "; 3798 getOperand(3)->printAsOperand(O, SlotTracker); 3799 O << ", "; 3800 getOperand(4)->printAsOperand(O, SlotTracker); 3801 } 3802 } 3803 #endif 3804 3805 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 3806 assert(!State.Lane && "cannot be used in per-lane"); 3807 const DataLayout &DL = SE.getDataLayout(); 3808 SCEVExpander Exp(SE, DL, "induction", /*PreserveLCSSA=*/true); 3809 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 3810 &*State.Builder.GetInsertPoint()); 3811 State.set(this, Res, VPLane(0)); 3812 } 3813 3814 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3815 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 3816 VPSlotTracker &SlotTracker) const { 3817 O << Indent << "EMIT "; 3818 printAsOperand(O, SlotTracker); 3819 O << " = EXPAND SCEV " << *Expr; 3820 } 3821 #endif 3822 3823 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 3824 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true); 3825 Type *STy = CanonicalIV->getType(); 3826 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 3827 ElementCount VF = 
State.VF; 3828 Value *VStart = VF.isScalar() 3829 ? CanonicalIV 3830 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 3831 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); 3832 if (VF.isVector()) { 3833 VStep = Builder.CreateVectorSplat(VF, VStep); 3834 VStep = 3835 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 3836 } 3837 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 3838 State.set(this, CanonicalVectorIV); 3839 } 3840 3841 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3842 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 3843 VPSlotTracker &SlotTracker) const { 3844 O << Indent << "EMIT "; 3845 printAsOperand(O, SlotTracker); 3846 O << " = WIDEN-CANONICAL-INDUCTION "; 3847 printOperands(O, SlotTracker); 3848 } 3849 #endif 3850 3851 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 3852 auto &Builder = State.Builder; 3853 // Create a vector from the initial value. 3854 auto *VectorInit = getStartValue()->getLiveInIRValue(); 3855 3856 Type *VecTy = State.VF.isScalar() 3857 ? VectorInit->getType() 3858 : VectorType::get(VectorInit->getType(), State.VF); 3859 3860 BasicBlock *VectorPH = 3861 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); 3862 if (State.VF.isVector()) { 3863 auto *IdxTy = Builder.getInt32Ty(); 3864 auto *One = ConstantInt::get(IdxTy, 1); 3865 IRBuilder<>::InsertPointGuard Guard(Builder); 3866 Builder.SetInsertPoint(VectorPH->getTerminator()); 3867 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); 3868 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3869 VectorInit = Builder.CreateInsertElement( 3870 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); 3871 } 3872 3873 // Create a phi node for the new recurrence. 3874 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur"); 3875 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 3876 Phi->addIncoming(VectorInit, VectorPH); 3877 State.set(this, Phi); 3878 } 3879 3880 InstructionCost 3881 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, 3882 VPCostContext &Ctx) const { 3883 if (VF.isScalar()) 3884 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 3885 3886 return 0; 3887 } 3888 3889 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3890 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, 3891 VPSlotTracker &SlotTracker) const { 3892 O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; 3893 printAsOperand(O, SlotTracker); 3894 O << " = phi "; 3895 printOperands(O, SlotTracker); 3896 } 3897 #endif 3898 3899 void VPReductionPHIRecipe::execute(VPTransformState &State) { 3900 // Reductions do not have to start at zero. They can start with 3901 // any loop invariant values. 3902 VPValue *StartVPV = getStartValue(); 3903 3904 // In order to support recurrences we need to be able to vectorize Phi nodes. 3905 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3906 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 3907 // this value when we vectorize all of the instructions that use the PHI. 
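  // Stage #2, adding the incoming value from the loop latch, happens later,
  // once the vector loop body (and thus the exiting value) has been generated.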
3908 BasicBlock *VectorPH = 3909 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); 3910 bool ScalarPHI = State.VF.isScalar() || IsInLoop; 3911 Value *StartV = State.get(StartVPV, ScalarPHI); 3912 Type *VecTy = StartV->getType(); 3913 3914 BasicBlock *HeaderBB = State.CFG.PrevBB; 3915 assert(State.CurrentParentLoop->getHeader() == HeaderBB && 3916 "recipe must be in the vector loop header"); 3917 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); 3918 Phi->insertBefore(HeaderBB->getFirstInsertionPt()); 3919 State.set(this, Phi, IsInLoop); 3920 3921 Phi->addIncoming(StartV, VectorPH); 3922 } 3923 3924 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3925 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3926 VPSlotTracker &SlotTracker) const { 3927 O << Indent << "WIDEN-REDUCTION-PHI "; 3928 3929 printAsOperand(O, SlotTracker); 3930 O << " = phi "; 3931 printOperands(O, SlotTracker); 3932 if (VFScaleFactor != 1) 3933 O << " (VF scaled by 1/" << VFScaleFactor << ")"; 3934 } 3935 #endif 3936 3937 void VPWidenPHIRecipe::execute(VPTransformState &State) { 3938 Value *Op0 = State.get(getOperand(0)); 3939 Type *VecTy = Op0->getType(); 3940 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name); 3941 // Manually move it with the other PHIs in case PHI recipes above this one 3942 // also inserted non-phi instructions. 3943 // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in 3944 // convertToConcreteRecipes. 3945 VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt()); 3946 State.set(this, VecPhi); 3947 } 3948 3949 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3950 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3951 VPSlotTracker &SlotTracker) const { 3952 O << Indent << "WIDEN-PHI "; 3953 3954 printAsOperand(O, SlotTracker); 3955 O << " = phi "; 3956 printPhiOperands(O, SlotTracker); 3957 } 3958 #endif 3959 3960 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and 3961 // remove VPActiveLaneMaskPHIRecipe. 3962 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { 3963 BasicBlock *VectorPH = 3964 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); 3965 Value *StartMask = State.get(getOperand(0)); 3966 PHINode *Phi = 3967 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); 3968 Phi->addIncoming(StartMask, VectorPH); 3969 State.set(this, Phi); 3970 } 3971 3972 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3973 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3974 VPSlotTracker &SlotTracker) const { 3975 O << Indent << "ACTIVE-LANE-MASK-PHI "; 3976 3977 printAsOperand(O, SlotTracker); 3978 O << " = phi "; 3979 printOperands(O, SlotTracker); 3980 } 3981 #endif 3982 3983 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3984 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3985 VPSlotTracker &SlotTracker) const { 3986 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; 3987 3988 printAsOperand(O, SlotTracker); 3989 O << " = phi "; 3990 printOperands(O, SlotTracker); 3991 } 3992 #endif 3993