1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file contains implementations for different VPlan recipes. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "LoopVectorizationPlanner.h" 15 #include "VPlan.h" 16 #include "VPlanAnalysis.h" 17 #include "VPlanHelpers.h" 18 #include "VPlanPatternMatch.h" 19 #include "VPlanUtils.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Analysis/AssumptionCache.h" 24 #include "llvm/Analysis/IVDescriptors.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/IRBuilder.h" 28 #include "llvm/IR/Instruction.h" 29 #include "llvm/IR/Instructions.h" 30 #include "llvm/IR/Intrinsics.h" 31 #include "llvm/IR/Type.h" 32 #include "llvm/IR/Value.h" 33 #include "llvm/Support/Casting.h" 34 #include "llvm/Support/CommandLine.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/raw_ostream.h" 37 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 38 #include "llvm/Transforms/Utils/LoopUtils.h" 39 #include "llvm/Transforms/Utils/LoopVersioning.h" 40 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 41 #include <cassert> 42 43 using namespace llvm; 44 45 using VectorParts = SmallVector<Value *, 2>; 46 47 #define LV_NAME "loop-vectorize" 48 #define DEBUG_TYPE LV_NAME 49 50 bool VPRecipeBase::mayWriteToMemory() const { 51 switch (getVPDefID()) { 52 case VPExpressionSC: 53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); 54 case VPInstructionSC: 55 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); 56 case VPInterleaveSC: 57 return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; 58 case VPWidenStoreEVLSC: 59 case VPWidenStoreSC: 60 return true; 61 case VPReplicateSC: 62 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 63 ->mayWriteToMemory(); 64 case VPWidenCallSC: 65 return !cast<VPWidenCallRecipe>(this) 66 ->getCalledScalarFunction() 67 ->onlyReadsMemory(); 68 case VPWidenIntrinsicSC: 69 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory(); 70 case VPCanonicalIVPHISC: 71 case VPBranchOnMaskSC: 72 case VPFirstOrderRecurrencePHISC: 73 case VPReductionPHISC: 74 case VPScalarIVStepsSC: 75 case VPPredInstPHISC: 76 return false; 77 case VPBlendSC: 78 case VPReductionEVLSC: 79 case VPReductionSC: 80 case VPVectorPointerSC: 81 case VPWidenCanonicalIVSC: 82 case VPWidenCastSC: 83 case VPWidenGEPSC: 84 case VPWidenIntOrFpInductionSC: 85 case VPWidenLoadEVLSC: 86 case VPWidenLoadSC: 87 case VPWidenPHISC: 88 case VPWidenSC: 89 case VPWidenSelectSC: { 90 const Instruction *I = 91 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 92 (void)I; 93 assert((!I || !I->mayWriteToMemory()) && 94 "underlying instruction may write to memory"); 95 return false; 96 } 97 default: 98 return true; 99 } 100 } 101 102 bool VPRecipeBase::mayReadFromMemory() const { 103 switch (getVPDefID()) { 104 case VPExpressionSC: 105 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); 106 case VPInstructionSC: 107 return 
cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); 108 case VPWidenLoadEVLSC: 109 case VPWidenLoadSC: 110 return true; 111 case VPReplicateSC: 112 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) 113 ->mayReadFromMemory(); 114 case VPWidenCallSC: 115 return !cast<VPWidenCallRecipe>(this) 116 ->getCalledScalarFunction() 117 ->onlyWritesMemory(); 118 case VPWidenIntrinsicSC: 119 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory(); 120 case VPBranchOnMaskSC: 121 case VPFirstOrderRecurrencePHISC: 122 case VPPredInstPHISC: 123 case VPScalarIVStepsSC: 124 case VPWidenStoreEVLSC: 125 case VPWidenStoreSC: 126 return false; 127 case VPBlendSC: 128 case VPReductionEVLSC: 129 case VPReductionSC: 130 case VPVectorPointerSC: 131 case VPWidenCanonicalIVSC: 132 case VPWidenCastSC: 133 case VPWidenGEPSC: 134 case VPWidenIntOrFpInductionSC: 135 case VPWidenPHISC: 136 case VPWidenSC: 137 case VPWidenSelectSC: { 138 const Instruction *I = 139 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 140 (void)I; 141 assert((!I || !I->mayReadFromMemory()) && 142 "underlying instruction may read from memory"); 143 return false; 144 } 145 default: 146 return true; 147 } 148 } 149 150 bool VPRecipeBase::mayHaveSideEffects() const { 151 switch (getVPDefID()) { 152 case VPExpressionSC: 153 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects(); 154 case VPDerivedIVSC: 155 case VPFirstOrderRecurrencePHISC: 156 case VPPredInstPHISC: 157 case VPVectorEndPointerSC: 158 return false; 159 case VPInstructionSC: 160 return mayWriteToMemory(); 161 case VPWidenCallSC: { 162 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); 163 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); 164 } 165 case VPWidenIntrinsicSC: 166 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects(); 167 case VPBlendSC: 168 case VPReductionEVLSC: 169 case VPReductionSC: 170 case VPScalarIVStepsSC: 171 case VPVectorPointerSC: 172 case VPWidenCanonicalIVSC: 173 case VPWidenCastSC: 174 case VPWidenGEPSC: 175 case VPWidenIntOrFpInductionSC: 176 case VPWidenPHISC: 177 case VPWidenPointerInductionSC: 178 case VPWidenSC: 179 case VPWidenSelectSC: { 180 const Instruction *I = 181 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); 182 (void)I; 183 assert((!I || !I->mayHaveSideEffects()) && 184 "underlying instruction has side-effects"); 185 return false; 186 } 187 case VPInterleaveSC: 188 return mayWriteToMemory(); 189 case VPWidenLoadEVLSC: 190 case VPWidenLoadSC: 191 case VPWidenStoreEVLSC: 192 case VPWidenStoreSC: 193 assert( 194 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() == 195 mayWriteToMemory() && 196 "mayHaveSideEffects result for ingredient differs from this " 197 "implementation"); 198 return mayWriteToMemory(); 199 case VPReplicateSC: { 200 auto *R = cast<VPReplicateRecipe>(this); 201 return R->getUnderlyingInstr()->mayHaveSideEffects(); 202 } 203 default: 204 return true; 205 } 206 } 207 208 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 209 assert(!Parent && "Recipe already in some VPBasicBlock"); 210 assert(InsertPos->getParent() && 211 "Insertion position not in any VPBasicBlock"); 212 InsertPos->getParent()->insert(this, InsertPos->getIterator()); 213 } 214 215 void VPRecipeBase::insertBefore(VPBasicBlock &BB, 216 iplist<VPRecipeBase>::iterator I) { 217 assert(!Parent && "Recipe already in some VPBasicBlock"); 218 assert(I == BB.end() || I->getParent() == &BB); 219 BB.insert(this,
I); 220 } 221 222 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 223 assert(!Parent && "Recipe already in some VPBasicBlock"); 224 assert(InsertPos->getParent() && 225 "Insertion position not in any VPBasicBlock"); 226 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator())); 227 } 228 229 void VPRecipeBase::removeFromParent() { 230 assert(getParent() && "Recipe not in any VPBasicBlock"); 231 getParent()->getRecipeList().remove(getIterator()); 232 Parent = nullptr; 233 } 234 235 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 236 assert(getParent() && "Recipe not in any VPBasicBlock"); 237 return getParent()->getRecipeList().erase(getIterator()); 238 } 239 240 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 241 removeFromParent(); 242 insertAfter(InsertPos); 243 } 244 245 void VPRecipeBase::moveBefore(VPBasicBlock &BB, 246 iplist<VPRecipeBase>::iterator I) { 247 removeFromParent(); 248 insertBefore(BB, I); 249 } 250 251 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { 252 // Get the underlying instruction for the recipe, if there is one. It is used 253 // to 254 // * decide if cost computation should be skipped for this recipe, 255 // * apply forced target instruction cost. 256 Instruction *UI = nullptr; 257 if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) 258 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 259 else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) 260 UI = IG->getInsertPos(); 261 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) 262 UI = &WidenMem->getIngredient(); 263 264 InstructionCost RecipeCost; 265 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { 266 RecipeCost = 0; 267 } else { 268 RecipeCost = computeCost(VF, Ctx); 269 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 && 270 RecipeCost.isValid()) 271 RecipeCost = InstructionCost(ForceTargetInstructionCost); 272 } 273 274 LLVM_DEBUG({ 275 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; 276 dump(); 277 }); 278 return RecipeCost; 279 } 280 281 InstructionCost VPRecipeBase::computeCost(ElementCount VF, 282 VPCostContext &Ctx) const { 283 llvm_unreachable("subclasses should implement computeCost"); 284 } 285 286 bool VPRecipeBase::isPhi() const { 287 return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) || 288 (isa<VPInstruction>(this) && 289 cast<VPInstruction>(this)->getOpcode() == Instruction::PHI) || 290 isa<VPIRPhi>(this); 291 } 292 293 bool VPRecipeBase::isScalarCast() const { 294 auto *VPI = dyn_cast<VPInstruction>(this); 295 return VPI && Instruction::isCast(VPI->getOpcode()); 296 } 297 298 InstructionCost 299 VPPartialReductionRecipe::computeCost(ElementCount VF, 300 VPCostContext &Ctx) const { 301 std::optional<unsigned> Opcode; 302 VPValue *Op = getOperand(0); 303 VPRecipeBase *OpR = Op->getDefiningRecipe(); 304 305 // If the partial reduction is predicated, a select will be operand 0 306 using namespace llvm::VPlanPatternMatch; 307 if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) { 308 OpR = Op->getDefiningRecipe(); 309 } 310 311 Type *InputTypeA = nullptr, *InputTypeB = nullptr; 312 TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, 313 ExtBType = TTI::PR_None; 314 315 auto GetExtendKind = [](VPRecipeBase *R) { 316 if (!R) 317 return TTI::PR_None; 318 auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R); 319 if (!WidenCastR) 320 return TTI::PR_None; 321 if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) 322 return 
TTI::PR_ZeroExtend; 323 if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) 324 return TTI::PR_SignExtend; 325 return TTI::PR_None; 326 }; 327 328 // Pick out opcode, type/ext information and use sub side effects from a widen 329 // recipe. 330 auto HandleWiden = [&](VPWidenRecipe *Widen) { 331 if (match(Widen, 332 m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Op)))) { 333 Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe()); 334 } 335 Opcode = Widen->getOpcode(); 336 VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe(); 337 VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe(); 338 InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) 339 : Widen->getOperand(0)); 340 InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0) 341 : Widen->getOperand(1)); 342 ExtAType = GetExtendKind(ExtAR); 343 ExtBType = GetExtendKind(ExtBR); 344 }; 345 346 if (isa<VPWidenCastRecipe>(OpR)) { 347 InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0)); 348 ExtAType = GetExtendKind(OpR); 349 } else if (isa<VPReductionPHIRecipe>(OpR)) { 350 auto RedPhiOp1R = getOperand(1)->getDefiningRecipe(); 351 if (isa<VPWidenCastRecipe>(RedPhiOp1R)) { 352 InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0)); 353 ExtAType = GetExtendKind(RedPhiOp1R); 354 } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R)) 355 HandleWiden(Widen); 356 } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) { 357 HandleWiden(Widen); 358 } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) { 359 return Reduction->computeCost(VF, Ctx); 360 } 361 auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); 362 return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB, 363 PhiType, VF, ExtAType, ExtBType, 364 Opcode, Ctx.CostKind); 365 } 366 367 void VPPartialReductionRecipe::execute(VPTransformState &State) { 368 auto &Builder = State.Builder; 369 370 assert(getOpcode() == Instruction::Add && 371 "Unhandled partial reduction opcode"); 372 373 Value *BinOpVal = State.get(getOperand(1)); 374 Value *PhiVal = State.get(getOperand(0)); 375 assert(PhiVal && BinOpVal && "Phi and Mul must be set"); 376 377 Type *RetTy = PhiVal->getType(); 378 379 CallInst *V = Builder.CreateIntrinsic( 380 RetTy, Intrinsic::experimental_vector_partial_reduce_add, 381 {PhiVal, BinOpVal}, nullptr, "partial.reduce"); 382 383 State.set(this, V); 384 } 385 386 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 387 void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, 388 VPSlotTracker &SlotTracker) const { 389 O << Indent << "PARTIAL-REDUCE "; 390 printAsOperand(O, SlotTracker); 391 O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; 392 printOperands(O, SlotTracker); 393 } 394 #endif 395 396 FastMathFlags VPIRFlags::getFastMathFlags() const { 397 assert(OpType == OperationType::FPMathOp && 398 "recipe doesn't have fast math flags"); 399 FastMathFlags Res; 400 Res.setAllowReassoc(FMFs.AllowReassoc); 401 Res.setNoNaNs(FMFs.NoNaNs); 402 Res.setNoInfs(FMFs.NoInfs); 403 Res.setNoSignedZeros(FMFs.NoSignedZeros); 404 Res.setAllowReciprocal(FMFs.AllowReciprocal); 405 Res.setAllowContract(FMFs.AllowContract); 406 Res.setApproxFunc(FMFs.ApproxFunc); 407 return Res; 408 } 409 410 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 411 void VPSingleDefRecipe::dump() const { VPDef::dump(); } 412 #endif 413 414 template <unsigned PartOpIdx> 415 VPValue * 416 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { 417 if 
(U.getNumOperands() == PartOpIdx + 1) 418 return U.getOperand(PartOpIdx); 419 return nullptr; 420 } 421 422 template <unsigned PartOpIdx> 423 unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { 424 if (auto *UnrollPartOp = getUnrollPartOperand(U)) 425 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); 426 return 0; 427 } 428 429 namespace llvm { 430 template class VPUnrollPartAccessor<2>; 431 template class VPUnrollPartAccessor<3>; 432 } 433 434 VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, 435 const VPIRFlags &Flags, DebugLoc DL, 436 const Twine &Name) 437 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), 438 VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { 439 assert(flagsValidForOpcode(getOpcode()) && 440 "Set flags not supported for the provided opcode"); 441 assert((getNumOperandsForOpcode(Opcode) == -1u || 442 getNumOperandsForOpcode(Opcode) == getNumOperands()) && 443 "number of operands does not match opcode"); 444 } 445 446 #ifndef NDEBUG 447 unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { 448 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) 449 return 1; 450 451 if (Instruction::isBinaryOp(Opcode)) 452 return 2; 453 454 switch (Opcode) { 455 case VPInstruction::StepVector: 456 return 0; 457 case Instruction::Alloca: 458 case Instruction::ExtractValue: 459 case Instruction::Freeze: 460 case Instruction::Load: 461 case VPInstruction::AnyOf: 462 case VPInstruction::BranchOnCond: 463 case VPInstruction::CalculateTripCountMinusVF: 464 case VPInstruction::CanonicalIVIncrementForPart: 465 case VPInstruction::ExplicitVectorLength: 466 case VPInstruction::ExtractLastElement: 467 case VPInstruction::ExtractPenultimateElement: 468 case VPInstruction::FirstActiveLane: 469 case VPInstruction::Not: 470 return 1; 471 case Instruction::ICmp: 472 case Instruction::FCmp: 473 case Instruction::Store: 474 case VPInstruction::ActiveLaneMask: 475 case VPInstruction::BranchOnCount: 476 case VPInstruction::ComputeReductionResult: 477 case VPInstruction::FirstOrderRecurrenceSplice: 478 case VPInstruction::LogicalAnd: 479 case VPInstruction::PtrAdd: 480 case VPInstruction::WideIVStep: 481 return 2; 482 case Instruction::Select: 483 case VPInstruction::ComputeAnyOfResult: 484 case VPInstruction::ReductionStartVector: 485 return 3; 486 case VPInstruction::ComputeFindIVResult: 487 return 4; 488 case Instruction::Call: 489 case Instruction::GetElementPtr: 490 case Instruction::PHI: 491 case Instruction::Switch: 492 // Cannot determine the number of operands from the opcode. 
493 return -1u; 494 } 495 llvm_unreachable("all cases should be handled above"); 496 } 497 #endif 498 499 bool VPInstruction::doesGeneratePerAllLanes() const { 500 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); 501 } 502 503 bool VPInstruction::canGenerateScalarForFirstLane() const { 504 if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) 505 return true; 506 if (isSingleScalar() || isVectorToScalar()) 507 return true; 508 switch (Opcode) { 509 case Instruction::Freeze: 510 case Instruction::ICmp: 511 case Instruction::PHI: 512 case Instruction::Select: 513 case VPInstruction::BranchOnCond: 514 case VPInstruction::BranchOnCount: 515 case VPInstruction::CalculateTripCountMinusVF: 516 case VPInstruction::CanonicalIVIncrementForPart: 517 case VPInstruction::PtrAdd: 518 case VPInstruction::ExplicitVectorLength: 519 case VPInstruction::AnyOf: 520 return true; 521 default: 522 return false; 523 } 524 } 525 526 Value *VPInstruction::generatePerLane(VPTransformState &State, 527 const VPLane &Lane) { 528 IRBuilderBase &Builder = State.Builder; 529 530 assert(getOpcode() == VPInstruction::PtrAdd && 531 "only PtrAdd opcodes are supported for now"); 532 return Builder.CreatePtrAdd(State.get(getOperand(0), Lane), 533 State.get(getOperand(1), Lane), Name); 534 } 535 536 /// Create a conditional branch using \p Cond branching to the successors of \p 537 /// VPBB. Note that the first successor is always forward (i.e. not created yet) 538 /// while the second successor may already have been created (if it is a header 539 /// block and VPBB is a latch). 540 static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, 541 VPTransformState &State) { 542 // Replace the temporary unreachable terminator with a new conditional 543 // branch, hooking it up to backward destination (header) for latch blocks 544 // now, and to forward destination(s) later when they are created. 545 // Second successor may be backwards - iff it is already in VPBB2IRBB. 
546 VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]); 547 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc); 548 BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB]; 549 BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc); 550 // First successor is always forward, reset it to nullptr 551 CondBr->setSuccessor(0, nullptr); 552 IRBB->getTerminator()->eraseFromParent(); 553 return CondBr; 554 } 555 556 Value *VPInstruction::generate(VPTransformState &State) { 557 IRBuilderBase &Builder = State.Builder; 558 559 if (Instruction::isBinaryOp(getOpcode())) { 560 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 561 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 562 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 563 auto *Res = 564 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); 565 if (auto *I = dyn_cast<Instruction>(Res)) 566 applyFlags(*I); 567 return Res; 568 } 569 570 switch (getOpcode()) { 571 case VPInstruction::Not: { 572 Value *A = State.get(getOperand(0)); 573 return Builder.CreateNot(A, Name); 574 } 575 case Instruction::ExtractElement: { 576 assert(State.VF.isVector() && "Only extract elements from vectors"); 577 if (getOperand(1)->isLiveIn()) { 578 unsigned IdxToExtract = 579 cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue(); 580 return State.get(getOperand(0), VPLane(IdxToExtract)); 581 } 582 Value *Vec = State.get(getOperand(0)); 583 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); 584 return Builder.CreateExtractElement(Vec, Idx, Name); 585 } 586 case Instruction::Freeze: { 587 Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); 588 return Builder.CreateFreeze(Op, Name); 589 } 590 case Instruction::ICmp: { 591 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 592 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); 593 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); 594 return Builder.CreateCmp(getPredicate(), A, B, Name); 595 } 596 case Instruction::PHI: { 597 llvm_unreachable("should be handled by VPPhi::execute"); 598 } 599 case Instruction::Select: { 600 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 601 Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); 602 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); 603 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); 604 return Builder.CreateSelect(Cond, Op1, Op2, Name); 605 } 606 case VPInstruction::ActiveLaneMask: { 607 // Get first lane of vector induction variable. 608 Value *VIVElem0 = State.get(getOperand(0), VPLane(0)); 609 // Get the original loop tripcount. 610 Value *ScalarTC = State.get(getOperand(1), VPLane(0)); 611 612 // If this part of the active lane mask is scalar, generate the CMP directly 613 // to avoid unnecessary extracts. 614 if (State.VF.isScalar()) 615 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, 616 Name); 617 618 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 619 auto *PredTy = VectorType::get(Int1Ty, State.VF); 620 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 621 {PredTy, ScalarTC->getType()}, 622 {VIVElem0, ScalarTC}, nullptr, Name); 623 } 624 case VPInstruction::FirstOrderRecurrenceSplice: { 625 // Generate code to combine the previous and current values in vector v3. 
626 // 627 // vector.ph: 628 // v_init = vector(..., ..., ..., a[-1]) 629 // br vector.body 630 // 631 // vector.body 632 // i = phi [0, vector.ph], [i+4, vector.body] 633 // v1 = phi [v_init, vector.ph], [v2, vector.body] 634 // v2 = a[i, i+1, i+2, i+3]; 635 // v3 = vector(v1(3), v2(0, 1, 2)) 636 637 auto *V1 = State.get(getOperand(0)); 638 if (!V1->getType()->isVectorTy()) 639 return V1; 640 Value *V2 = State.get(getOperand(1)); 641 return Builder.CreateVectorSplice(V1, V2, -1, Name); 642 } 643 case VPInstruction::CalculateTripCountMinusVF: { 644 unsigned UF = getParent()->getPlan()->getUF(); 645 Value *ScalarTC = State.get(getOperand(0), VPLane(0)); 646 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF); 647 Value *Sub = Builder.CreateSub(ScalarTC, Step); 648 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); 649 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); 650 return Builder.CreateSelect(Cmp, Sub, Zero); 651 } 652 case VPInstruction::ExplicitVectorLength: { 653 // TODO: Restructure this code with an explicit remainder loop, vsetvli can 654 // be outside of the main loop. 655 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true); 656 // Compute EVL 657 assert(AVL->getType()->isIntegerTy() && 658 "Requested vector length should be an integer."); 659 660 assert(State.VF.isScalable() && "Expected scalable vector factor."); 661 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); 662 663 Value *EVL = State.Builder.CreateIntrinsic( 664 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, 665 {AVL, VFArg, State.Builder.getTrue()}); 666 return EVL; 667 } 668 case VPInstruction::CanonicalIVIncrementForPart: { 669 unsigned Part = getUnrollPart(*this); 670 auto *IV = State.get(getOperand(0), VPLane(0)); 671 assert(Part != 0 && "Must have a positive part"); 672 // The canonical IV is incremented by the vectorization factor (num of 673 // SIMD elements) times the unroll part. 674 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); 675 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), 676 hasNoSignedWrap()); 677 } 678 case VPInstruction::BranchOnCond: { 679 Value *Cond = State.get(getOperand(0), VPLane(0)); 680 auto *Br = createCondBranch(Cond, getParent(), State); 681 applyMetadata(*Br); 682 return Br; 683 } 684 case VPInstruction::BranchOnCount: { 685 // First create the compare. 686 Value *IV = State.get(getOperand(0), /*IsScalar*/ true); 687 Value *TC = State.get(getOperand(1), /*IsScalar*/ true); 688 Value *Cond = Builder.CreateICmpEQ(IV, TC); 689 return createCondBranch(Cond, getParent(), State); 690 } 691 case VPInstruction::Broadcast: { 692 return Builder.CreateVectorSplat( 693 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast"); 694 } 695 case VPInstruction::BuildStructVector: { 696 // For struct types, we need to build a new 'wide' struct type, where each 697 // element is widened, i.e., we create a struct of vectors. 
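    // For example, at VF = 4 a scalar struct { i32, float } is widened to the
    // struct { <4 x i32>, <4 x float> }, with each operand's field values
    // inserted lane by lane into the corresponding field vectors.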
698 auto *StructTy = 699 cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0))); 700 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF)); 701 for (const auto &[LaneIndex, Op] : enumerate(operands())) { 702 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); 703 FieldIndex++) { 704 Value *ScalarValue = 705 Builder.CreateExtractValue(State.get(Op, true), FieldIndex); 706 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex); 707 VectorValue = 708 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex); 709 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex); 710 } 711 } 712 return Res; 713 } 714 case VPInstruction::BuildVector: { 715 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); 716 auto NumOfElements = ElementCount::getFixed(getNumOperands()); 717 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements)); 718 for (const auto &[Idx, Op] : enumerate(operands())) 719 Res = State.Builder.CreateInsertElement(Res, State.get(Op, true), 720 State.Builder.getInt32(Idx)); 721 return Res; 722 } 723 case VPInstruction::ReductionStartVector: { 724 if (State.VF.isScalar()) 725 return State.get(getOperand(0), true); 726 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 727 Builder.setFastMathFlags(getFastMathFlags()); 728 // If this start vector is scaled then it should produce a vector with fewer 729 // elements than the VF. 730 ElementCount VF = State.VF.divideCoefficientBy( 731 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue()); 732 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true)); 733 Constant *Zero = Builder.getInt32(0); 734 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true), 735 Zero); 736 } 737 case VPInstruction::ComputeAnyOfResult: { 738 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 739 // and will be removed by breaking up the recipe further. 740 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 741 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 742 Value *ReducedPartRdx = State.get(getOperand(2)); 743 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) 744 ReducedPartRdx = Builder.CreateBinOp( 745 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode( 746 RecurKind::AnyOf), 747 State.get(getOperand(Idx)), ReducedPartRdx, "bin.rdx"); 748 return createAnyOfReduction(Builder, ReducedPartRdx, 749 State.get(getOperand(1), VPLane(0)), OrigPhi); 750 } 751 case VPInstruction::ComputeFindIVResult: { 752 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 753 // and will be removed by breaking up the recipe further. 754 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 755 // Get its reduction variable descriptor. 756 RecurKind RK = PhiR->getRecurrenceKind(); 757 assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && 758 "Unexpected reduction kind"); 759 assert(!PhiR->isInLoop() && 760 "In-loop FindLastIV reduction is not supported yet"); 761 762 // The recipe's operands are the reduction phi, the start value, the 763 // sentinel value, followed by one operand for each part of the reduction. 764 unsigned UF = getNumOperands() - 3; 765 Value *ReducedPartRdx = State.get(getOperand(3)); 766 RecurKind MinMaxKind; 767 bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK); 768 if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) 769 MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax; 770 else 771 MinMaxKind = IsSigned ? 
RecurKind::SMin : RecurKind::UMin; 772 for (unsigned Part = 1; Part < UF; ++Part) 773 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 774 State.get(getOperand(3 + Part))); 775 776 Value *Start = State.get(getOperand(1), true); 777 Value *Sentinel = getOperand(2)->getLiveInIRValue(); 778 return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start, 779 Sentinel); 780 } 781 case VPInstruction::ComputeReductionResult: { 782 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary 783 // and will be removed by breaking up the recipe further. 784 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0)); 785 // Get its reduction variable descriptor. 786 787 RecurKind RK = PhiR->getRecurrenceKind(); 788 assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && 789 "should be handled by ComputeFindIVResult"); 790 791 // The recipe's operands are the reduction phi, followed by one operand for 792 // each part of the reduction. 793 unsigned UF = getNumOperands() - 1; 794 VectorParts RdxParts(UF); 795 for (unsigned Part = 0; Part < UF; ++Part) 796 RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop()); 797 798 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 799 if (hasFastMathFlags()) 800 Builder.setFastMathFlags(getFastMathFlags()); 801 802 // Reduce all of the unrolled parts into a single vector. 803 Value *ReducedPartRdx = RdxParts[0]; 804 if (PhiR->isOrdered()) { 805 ReducedPartRdx = RdxParts[UF - 1]; 806 } else { 807 // Floating-point operations should have some FMF to enable the reduction. 808 for (unsigned Part = 1; Part < UF; ++Part) { 809 Value *RdxPart = RdxParts[Part]; 810 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) 811 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 812 else 813 ReducedPartRdx = Builder.CreateBinOp( 814 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK), 815 RdxPart, ReducedPartRdx, "bin.rdx"); 816 } 817 } 818 819 // Create the reduction after the loop. Note that inloop reductions create 820 // the target reduction in the loop using a Reduction recipe. 821 if (State.VF.isVector() && !PhiR->isInLoop()) { 822 // TODO: Support in-order reductions based on the recurrence descriptor. 823 // All ops in the reduction inherit fast-math-flags from the recurrence 824 // descriptor. 825 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK); 826 } 827 828 return ReducedPartRdx; 829 } 830 case VPInstruction::ExtractLastElement: 831 case VPInstruction::ExtractPenultimateElement: { 832 unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2; 833 Value *Res; 834 if (State.VF.isVector()) { 835 assert(Offset <= State.VF.getKnownMinValue() && 836 "invalid offset to extract from"); 837 // Extract lane VF - Offset from the operand. 
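      // e.g. with a fixed VF of 4, ExtractLastElement (Offset 1) reads lane 3
      // and ExtractPenultimateElement (Offset 2) reads lane 2.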
838 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset)); 839 } else { 840 assert(Offset <= 1 && "invalid offset to extract from"); 841 Res = State.get(getOperand(0)); 842 } 843 if (isa<ExtractElementInst>(Res)) 844 Res->setName(Name); 845 return Res; 846 } 847 case VPInstruction::LogicalAnd: { 848 Value *A = State.get(getOperand(0)); 849 Value *B = State.get(getOperand(1)); 850 return Builder.CreateLogicalAnd(A, B, Name); 851 } 852 case VPInstruction::PtrAdd: { 853 assert(vputils::onlyFirstLaneUsed(this) && 854 "can only generate first lane for PtrAdd"); 855 Value *Ptr = State.get(getOperand(0), VPLane(0)); 856 Value *Addend = State.get(getOperand(1), VPLane(0)); 857 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); 858 } 859 case VPInstruction::AnyOf: { 860 Value *Res = State.get(getOperand(0)); 861 for (VPValue *Op : drop_begin(operands())) 862 Res = Builder.CreateOr(Res, State.get(Op)); 863 return Builder.CreateOrReduce(Res); 864 } 865 case VPInstruction::FirstActiveLane: { 866 if (getNumOperands() == 1) { 867 Value *Mask = State.get(getOperand(0)); 868 return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask, 869 true, Name); 870 } 871 // If there are multiple operands, create a chain of selects to pick the 872 // first operand with an active lane and add the number of lanes of the 873 // preceding operands. 874 Value *RuntimeVF = 875 getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF); 876 unsigned LastOpIdx = getNumOperands() - 1; 877 Value *Res = nullptr; 878 for (int Idx = LastOpIdx; Idx >= 0; --Idx) { 879 Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( 880 Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name); 881 Value *Current = Builder.CreateAdd( 882 Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); 883 if (Res) { 884 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF); 885 Res = Builder.CreateSelect(Cmp, Current, Res); 886 } else { 887 Res = Current; 888 } 889 } 890 891 return Res; 892 } 893 default: 894 llvm_unreachable("Unsupported opcode for instruction"); 895 } 896 } 897 898 InstructionCost VPInstruction::computeCost(ElementCount VF, 899 VPCostContext &Ctx) const { 900 if (Instruction::isBinaryOp(getOpcode())) { 901 Type *ResTy = Ctx.Types.inferScalarType(this); 902 if (!vputils::onlyFirstLaneUsed(this)) 903 ResTy = toVectorTy(ResTy, VF); 904 905 if (!getUnderlyingValue()) { 906 switch (getOpcode()) { 907 case Instruction::FMul: 908 return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind); 909 default: 910 // TODO: Compute cost for VPInstructions without underlying values once 911 // the legacy cost model has been retired. 912 return 0; 913 } 914 } 915 916 assert(!doesGeneratePerAllLanes() && 917 "Should only generate a vector value or single scalar, not scalars " 918 "for all lanes."); 919 return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind); 920 } 921 922 switch (getOpcode()) { 923 case Instruction::ExtractElement: { 924 // Add on the cost of extracting the element. 
925 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); 926 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, 927 Ctx.CostKind); 928 } 929 case VPInstruction::AnyOf: { 930 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 931 return Ctx.TTI.getArithmeticReductionCost( 932 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind); 933 } 934 case VPInstruction::FirstActiveLane: { 935 // Calculate the cost of determining the lane index. 936 auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); 937 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, 938 Type::getInt64Ty(Ctx.LLVMCtx), 939 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); 940 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); 941 } 942 case VPInstruction::FirstOrderRecurrenceSplice: { 943 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); 944 SmallVector<int> Mask(VF.getKnownMinValue()); 945 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 946 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 947 948 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 949 cast<VectorType>(VectorTy), 950 cast<VectorType>(VectorTy), Mask, 951 Ctx.CostKind, VF.getKnownMinValue() - 1); 952 } 953 case VPInstruction::ActiveLaneMask: { 954 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0)); 955 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); 956 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, 957 {ArgTy, ArgTy}); 958 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); 959 } 960 case VPInstruction::ExplicitVectorLength: { 961 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0)); 962 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx); 963 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx); 964 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length, 965 I32Ty, {Arg0Ty, I32Ty, I1Ty}); 966 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); 967 } 968 case VPInstruction::ExtractPenultimateElement: 969 if (VF == ElementCount::getScalable(1)) 970 return InstructionCost::getInvalid(); 971 LLVM_FALLTHROUGH; 972 default: 973 // TODO: Compute cost for other VPInstructions once the legacy cost model has 974 // been retired.
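    // Such VPInstructions have no underlying IR instruction and are treated as
    // free for now.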
975 assert(!getUnderlyingValue() && 976 "unexpected VPInstruction with underlying value"); 977 return 0; 978 } 979 } 980 981 bool VPInstruction::isVectorToScalar() const { 982 return getOpcode() == VPInstruction::ExtractLastElement || 983 getOpcode() == VPInstruction::ExtractPenultimateElement || 984 getOpcode() == Instruction::ExtractElement || 985 getOpcode() == VPInstruction::FirstActiveLane || 986 getOpcode() == VPInstruction::ComputeAnyOfResult || 987 getOpcode() == VPInstruction::ComputeFindIVResult || 988 getOpcode() == VPInstruction::ComputeReductionResult || 989 getOpcode() == VPInstruction::AnyOf; 990 } 991 992 bool VPInstruction::isSingleScalar() const { 993 return getOpcode() == Instruction::PHI || isScalarCast(); 994 } 995 996 void VPInstruction::execute(VPTransformState &State) { 997 assert(!State.Lane && "VPInstruction executing a Lane"); 998 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 999 assert(flagsValidForOpcode(getOpcode()) && 1000 "Set flags not supported for the provided opcode"); 1001 if (hasFastMathFlags()) 1002 State.Builder.setFastMathFlags(getFastMathFlags()); 1003 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && 1004 (vputils::onlyFirstLaneUsed(this) || 1005 isVectorToScalar() || isSingleScalar()); 1006 bool GeneratesPerAllLanes = doesGeneratePerAllLanes(); 1007 if (GeneratesPerAllLanes) { 1008 for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue(); 1009 Lane != NumLanes; ++Lane) { 1010 Value *GeneratedValue = generatePerLane(State, VPLane(Lane)); 1011 assert(GeneratedValue && "generatePerLane must produce a value"); 1012 State.set(this, GeneratedValue, VPLane(Lane)); 1013 } 1014 return; 1015 } 1016 1017 Value *GeneratedValue = generate(State); 1018 if (!hasResult()) 1019 return; 1020 assert(GeneratedValue && "generate must produce a value"); 1021 assert((((GeneratedValue->getType()->isVectorTy() || 1022 GeneratedValue->getType()->isStructTy()) == 1023 !GeneratesPerFirstLaneOnly) || 1024 State.VF.isScalar()) && 1025 "scalar value but not only first lane defined"); 1026 State.set(this, GeneratedValue, 1027 /*IsScalar*/ GeneratesPerFirstLaneOnly); 1028 } 1029 1030 bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { 1031 if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) 1032 return false; 1033 switch (getOpcode()) { 1034 case Instruction::ExtractElement: 1035 case Instruction::Freeze: 1036 case Instruction::ICmp: 1037 case Instruction::Select: 1038 case VPInstruction::AnyOf: 1039 case VPInstruction::BuildStructVector: 1040 case VPInstruction::BuildVector: 1041 case VPInstruction::CalculateTripCountMinusVF: 1042 case VPInstruction::CanonicalIVIncrementForPart: 1043 case VPInstruction::ExtractLastElement: 1044 case VPInstruction::ExtractPenultimateElement: 1045 case VPInstruction::FirstActiveLane: 1046 case VPInstruction::FirstOrderRecurrenceSplice: 1047 case VPInstruction::LogicalAnd: 1048 case VPInstruction::Not: 1049 case VPInstruction::PtrAdd: 1050 case VPInstruction::WideIVStep: 1051 case VPInstruction::StepVector: 1052 case VPInstruction::ReductionStartVector: 1053 return false; 1054 default: 1055 return true; 1056 } 1057 } 1058 1059 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { 1060 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 1061 if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) 1062 return vputils::onlyFirstLaneUsed(this); 1063 1064 switch (getOpcode()) { 1065 default: 1066 return false; 1067 case
Instruction::ExtractElement: 1068 return Op == getOperand(1); 1069 case Instruction::PHI: 1070 return true; 1071 case Instruction::ICmp: 1072 case Instruction::Select: 1073 case Instruction::Or: 1074 case Instruction::Freeze: 1075 // TODO: Cover additional opcodes. 1076 return vputils::onlyFirstLaneUsed(this); 1077 case VPInstruction::ActiveLaneMask: 1078 case VPInstruction::ExplicitVectorLength: 1079 case VPInstruction::CalculateTripCountMinusVF: 1080 case VPInstruction::CanonicalIVIncrementForPart: 1081 case VPInstruction::BranchOnCount: 1082 case VPInstruction::BranchOnCond: 1083 case VPInstruction::Broadcast: 1084 case VPInstruction::ReductionStartVector: 1085 return true; 1086 case VPInstruction::PtrAdd: 1087 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); 1088 case VPInstruction::ComputeAnyOfResult: 1089 case VPInstruction::ComputeFindIVResult: 1090 return Op == getOperand(1); 1091 }; 1092 llvm_unreachable("switch should return"); 1093 } 1094 1095 bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { 1096 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 1097 if (Instruction::isBinaryOp(getOpcode())) 1098 return vputils::onlyFirstPartUsed(this); 1099 1100 switch (getOpcode()) { 1101 default: 1102 return false; 1103 case Instruction::ICmp: 1104 case Instruction::Select: 1105 return vputils::onlyFirstPartUsed(this); 1106 case VPInstruction::BranchOnCount: 1107 case VPInstruction::BranchOnCond: 1108 case VPInstruction::CanonicalIVIncrementForPart: 1109 return true; 1110 }; 1111 llvm_unreachable("switch should return"); 1112 } 1113 1114 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1115 void VPInstruction::dump() const { 1116 VPSlotTracker SlotTracker(getParent()->getPlan()); 1117 print(dbgs(), "", SlotTracker); 1118 } 1119 1120 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 1121 VPSlotTracker &SlotTracker) const { 1122 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; 1123 1124 if (hasResult()) { 1125 printAsOperand(O, SlotTracker); 1126 O << " = "; 1127 } 1128 1129 switch (getOpcode()) { 1130 case VPInstruction::Not: 1131 O << "not"; 1132 break; 1133 case VPInstruction::SLPLoad: 1134 O << "combined load"; 1135 break; 1136 case VPInstruction::SLPStore: 1137 O << "combined store"; 1138 break; 1139 case VPInstruction::ActiveLaneMask: 1140 O << "active lane mask"; 1141 break; 1142 case VPInstruction::ExplicitVectorLength: 1143 O << "EXPLICIT-VECTOR-LENGTH"; 1144 break; 1145 case VPInstruction::FirstOrderRecurrenceSplice: 1146 O << "first-order splice"; 1147 break; 1148 case VPInstruction::BranchOnCond: 1149 O << "branch-on-cond"; 1150 break; 1151 case VPInstruction::CalculateTripCountMinusVF: 1152 O << "TC > VF ? 
TC - VF : 0"; 1153 break; 1154 case VPInstruction::CanonicalIVIncrementForPart: 1155 O << "VF * Part +"; 1156 break; 1157 case VPInstruction::BranchOnCount: 1158 O << "branch-on-count"; 1159 break; 1160 case VPInstruction::Broadcast: 1161 O << "broadcast"; 1162 break; 1163 case VPInstruction::BuildStructVector: 1164 O << "buildstructvector"; 1165 break; 1166 case VPInstruction::BuildVector: 1167 O << "buildvector"; 1168 break; 1169 case VPInstruction::ExtractLastElement: 1170 O << "extract-last-element"; 1171 break; 1172 case VPInstruction::ExtractPenultimateElement: 1173 O << "extract-penultimate-element"; 1174 break; 1175 case VPInstruction::ComputeAnyOfResult: 1176 O << "compute-anyof-result"; 1177 break; 1178 case VPInstruction::ComputeFindIVResult: 1179 O << "compute-find-iv-result"; 1180 break; 1181 case VPInstruction::ComputeReductionResult: 1182 O << "compute-reduction-result"; 1183 break; 1184 case VPInstruction::LogicalAnd: 1185 O << "logical-and"; 1186 break; 1187 case VPInstruction::PtrAdd: 1188 O << "ptradd"; 1189 break; 1190 case VPInstruction::AnyOf: 1191 O << "any-of"; 1192 break; 1193 case VPInstruction::FirstActiveLane: 1194 O << "first-active-lane"; 1195 break; 1196 case VPInstruction::ReductionStartVector: 1197 O << "reduction-start-vector"; 1198 break; 1199 default: 1200 O << Instruction::getOpcodeName(getOpcode()); 1201 } 1202 1203 printFlags(O); 1204 printOperands(O, SlotTracker); 1205 1206 if (auto DL = getDebugLoc()) { 1207 O << ", !dbg "; 1208 DL.print(O); 1209 } 1210 } 1211 #endif 1212 1213 void VPInstructionWithType::execute(VPTransformState &State) { 1214 State.setDebugLocFrom(getDebugLoc()); 1215 if (isScalarCast()) { 1216 Value *Op = State.get(getOperand(0), VPLane(0)); 1217 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()), 1218 Op, ResultTy); 1219 State.set(this, Cast, VPLane(0)); 1220 return; 1221 } 1222 switch (getOpcode()) { 1223 case VPInstruction::StepVector: { 1224 Value *StepVector = 1225 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF)); 1226 State.set(this, StepVector); 1227 break; 1228 } 1229 default: 1230 llvm_unreachable("opcode not implemented yet"); 1231 } 1232 } 1233 1234 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1235 void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent, 1236 VPSlotTracker &SlotTracker) const { 1237 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; 1238 printAsOperand(O, SlotTracker); 1239 O << " = "; 1240 1241 switch (getOpcode()) { 1242 case VPInstruction::WideIVStep: 1243 O << "wide-iv-step "; 1244 printOperands(O, SlotTracker); 1245 break; 1246 case VPInstruction::StepVector: 1247 O << "step-vector " << *ResultTy; 1248 break; 1249 default: 1250 assert(Instruction::isCast(getOpcode()) && "unhandled opcode"); 1251 O << Instruction::getOpcodeName(getOpcode()) << " "; 1252 printOperands(O, SlotTracker); 1253 O << " to " << *ResultTy; 1254 } 1255 } 1256 #endif 1257 1258 void VPPhi::execute(VPTransformState &State) { 1259 State.setDebugLocFrom(getDebugLoc()); 1260 PHINode *NewPhi = State.Builder.CreatePHI( 1261 State.TypeAnalysis.inferScalarType(this), 2, getName()); 1262 unsigned NumIncoming = getNumIncoming(); 1263 if (getParent() != getParent()->getPlan()->getScalarPreheader()) { 1264 // TODO: Fixup all incoming values of header phis once recipes defining them 1265 // are introduced. 
1266 NumIncoming = 1; 1267 } 1268 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) { 1269 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0)); 1270 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx)); 1271 NewPhi->addIncoming(IncV, PredBB); 1272 } 1273 State.set(this, NewPhi, VPLane(0)); 1274 } 1275 1276 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1277 void VPPhi::print(raw_ostream &O, const Twine &Indent, 1278 VPSlotTracker &SlotTracker) const { 1279 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; 1280 printAsOperand(O, SlotTracker); 1281 O << " = phi "; 1282 printPhiOperands(O, SlotTracker); 1283 } 1284 #endif 1285 1286 VPIRInstruction *VPIRInstruction ::create(Instruction &I) { 1287 if (auto *Phi = dyn_cast<PHINode>(&I)) 1288 return new VPIRPhi(*Phi); 1289 return new VPIRInstruction(I); 1290 } 1291 1292 void VPIRInstruction::execute(VPTransformState &State) { 1293 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 && 1294 "PHINodes must be handled by VPIRPhi"); 1295 // Advance the insert point after the wrapped IR instruction. This allows 1296 // interleaving VPIRInstructions and other recipes. 1297 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); 1298 } 1299 1300 InstructionCost VPIRInstruction::computeCost(ElementCount VF, 1301 VPCostContext &Ctx) const { 1302 // The recipe wraps an existing IR instruction on the border of VPlan's scope, 1303 // hence it does not contribute to the cost-modeling for the VPlan. 1304 return 0; 1305 } 1306 1307 void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) { 1308 assert(isa<PHINode>(getInstruction()) && 1309 "can only update exiting operands to phi nodes"); 1310 assert(getNumOperands() > 0 && "must have at least one operand"); 1311 VPValue *Exiting = getOperand(0); 1312 if (Exiting->isLiveIn()) 1313 return; 1314 1315 Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting}); 1316 setOperand(0, Exiting); 1317 } 1318 1319 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1320 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, 1321 VPSlotTracker &SlotTracker) const { 1322 O << Indent << "IR " << I; 1323 } 1324 #endif 1325 1326 void VPIRPhi::execute(VPTransformState &State) { 1327 PHINode *Phi = &getIRPhi(); 1328 for (const auto &[Idx, Op] : enumerate(operands())) { 1329 VPValue *ExitValue = Op; 1330 auto Lane = vputils::isSingleScalar(ExitValue) 1331 ? VPLane::getFirstLane() 1332 : VPLane::getLastLaneForVF(State.VF); 1333 VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; 1334 auto *PredVPBB = Pred->getExitingBasicBlock(); 1335 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; 1336 // Set insertion point in PredBB in case an extract needs to be generated. 1337 // TODO: Model extracts explicitly. 1338 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); 1339 Value *V = State.get(ExitValue, VPLane(Lane)); 1340 // If there is no existing block for PredBB in the phi, add a new incoming 1341 // value. Otherwise update the existing incoming value for PredBB. 1342 if (Phi->getBasicBlockIndex(PredBB) == -1) 1343 Phi->addIncoming(V, PredBB); 1344 else 1345 Phi->setIncomingValueForBlock(PredBB, V); 1346 } 1347 1348 // Advance the insert point after the wrapped IR instruction. This allows 1349 // interleaving VPIRInstructions and other recipes. 
1350 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator())); 1351 } 1352 1353 void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { 1354 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe()); 1355 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && 1356 "Number of phi operands must match number of predecessors"); 1357 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock); 1358 R->removeOperand(Position); 1359 } 1360 1361 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1362 void VPPhiAccessors::printPhiOperands(raw_ostream &O, 1363 VPSlotTracker &SlotTracker) const { 1364 interleaveComma(enumerate(getAsRecipe()->operands()), O, 1365 [this, &O, &SlotTracker](auto Op) { 1366 O << "[ "; 1367 Op.value()->printAsOperand(O, SlotTracker); 1368 O << ", "; 1369 getIncomingBlock(Op.index())->printAsOperand(O); 1370 O << " ]"; 1371 }); 1372 } 1373 #endif 1374 1375 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1376 void VPIRPhi::print(raw_ostream &O, const Twine &Indent, 1377 VPSlotTracker &SlotTracker) const { 1378 VPIRInstruction::print(O, Indent, SlotTracker); 1379 1380 if (getNumOperands() != 0) { 1381 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; 1382 interleaveComma( 1383 enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { 1384 Op.value()->printAsOperand(O, SlotTracker); 1385 O << " from "; 1386 getParent()->getPredecessors()[Op.index()]->printAsOperand(O); 1387 }); 1388 O << ")"; 1389 } 1390 } 1391 #endif 1392 1393 VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer) 1394 : VPIRMetadata(I) { 1395 if (!LVer || !isa<LoadInst, StoreInst>(&I)) 1396 return; 1397 const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I); 1398 if (AliasScopeMD) 1399 Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD); 1400 if (NoAliasMD) 1401 Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD); 1402 } 1403 1404 void VPIRMetadata::applyMetadata(Instruction &I) const { 1405 for (const auto &[Kind, Node] : Metadata) 1406 I.setMetadata(Kind, Node); 1407 } 1408 1409 void VPWidenCallRecipe::execute(VPTransformState &State) { 1410 assert(State.VF.isVector() && "not widening"); 1411 assert(Variant != nullptr && "Can't create vector function."); 1412 1413 FunctionType *VFTy = Variant->getFunctionType(); 1414 // Add return type if intrinsic is overloaded on it. 1415 SmallVector<Value *, 4> Args; 1416 for (const auto &I : enumerate(args())) { 1417 Value *Arg; 1418 // Some vectorized function variants may also take a scalar argument, 1419 // e.g. linear parameters for pointers. This needs to be the scalar value 1420 // from the start of the respective part when interleaving. 
1421 if (!VFTy->getParamType(I.index())->isVectorTy()) 1422 Arg = State.get(I.value(), VPLane(0)); 1423 else 1424 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 1425 Args.push_back(Arg); 1426 } 1427 1428 auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); 1429 SmallVector<OperandBundleDef, 1> OpBundles; 1430 if (CI) 1431 CI->getOperandBundlesAsDefs(OpBundles); 1432 1433 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles); 1434 applyFlags(*V); 1435 applyMetadata(*V); 1436 V->setCallingConv(Variant->getCallingConv()); 1437 1438 if (!V->getType()->isVoidTy()) 1439 State.set(this, V); 1440 } 1441 1442 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, 1443 VPCostContext &Ctx) const { 1444 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), 1445 Variant->getFunctionType()->params(), 1446 Ctx.CostKind); 1447 } 1448 1449 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1450 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 1451 VPSlotTracker &SlotTracker) const { 1452 O << Indent << "WIDEN-CALL "; 1453 1454 Function *CalledFn = getCalledScalarFunction(); 1455 if (CalledFn->getReturnType()->isVoidTy()) 1456 O << "void "; 1457 else { 1458 printAsOperand(O, SlotTracker); 1459 O << " = "; 1460 } 1461 1462 O << "call"; 1463 printFlags(O); 1464 O << " @" << CalledFn->getName() << "("; 1465 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { 1466 Op->printAsOperand(O, SlotTracker); 1467 }); 1468 O << ")"; 1469 1470 O << " (using library function"; 1471 if (Variant->hasName()) 1472 O << ": " << Variant->getName(); 1473 O << ")"; 1474 } 1475 #endif 1476 1477 void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { 1478 assert(State.VF.isVector() && "not widening"); 1479 1480 SmallVector<Type *, 2> TysForDecl; 1481 // Add return type if intrinsic is overloaded on it. 1482 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI)) 1483 TysForDecl.push_back(VectorType::get(getResultType(), State.VF)); 1484 SmallVector<Value *, 4> Args; 1485 for (const auto &I : enumerate(operands())) { 1486 // Some intrinsics have a scalar argument - don't replace it with a 1487 // vector. 1488 Value *Arg; 1489 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(), 1490 State.TTI)) 1491 Arg = State.get(I.value(), VPLane(0)); 1492 else 1493 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); 1494 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), 1495 State.TTI)) 1496 TysForDecl.push_back(Arg->getType()); 1497 Args.push_back(Arg); 1498 } 1499 1500 // Use vector version of the intrinsic. 1501 Module *M = State.Builder.GetInsertBlock()->getModule(); 1502 Function *VectorF = 1503 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); 1504 assert(VectorF && 1505 "Can't retrieve vector intrinsic or vector-predication intrinsics."); 1506 1507 auto *CI = cast_or_null<CallInst>(getUnderlyingValue()); 1508 SmallVector<OperandBundleDef, 1> OpBundles; 1509 if (CI) 1510 CI->getOperandBundlesAsDefs(OpBundles); 1511 1512 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); 1513 1514 applyFlags(*V); 1515 applyMetadata(*V); 1516 1517 if (!V->getType()->isVoidTy()) 1518 State.set(this, V); 1519 } 1520 1521 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, 1522 VPCostContext &Ctx) const { 1523 // Some backends analyze intrinsic arguments to determine cost. Use the 1524 // underlying value for the operand if it has one. 
Otherwise try to use the 1525 // operand of the underlying call instruction, if there is one. Otherwise 1526 // clear Arguments. 1527 // TODO: Rework TTI interface to be independent of concrete IR values. 1528 SmallVector<const Value *> Arguments; 1529 for (const auto &[Idx, Op] : enumerate(operands())) { 1530 auto *V = Op->getUnderlyingValue(); 1531 if (!V) { 1532 if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) { 1533 Arguments.push_back(UI->getArgOperand(Idx)); 1534 continue; 1535 } 1536 Arguments.clear(); 1537 break; 1538 } 1539 Arguments.push_back(V); 1540 } 1541 1542 Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); 1543 SmallVector<Type *> ParamTys; 1544 for (unsigned I = 0; I != getNumOperands(); ++I) 1545 ParamTys.push_back( 1546 toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); 1547 1548 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. 1549 FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); 1550 IntrinsicCostAttributes CostAttrs( 1551 VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, 1552 dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()), 1553 InstructionCost::getInvalid(), &Ctx.TLI); 1554 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); 1555 } 1556 1557 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { 1558 return Intrinsic::getBaseName(VectorIntrinsicID); 1559 } 1560 1561 bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { 1562 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); 1563 return all_of(enumerate(operands()), [this, &Op](const auto &X) { 1564 auto [Idx, V] = X; 1565 return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), 1566 Idx, nullptr); 1567 }); 1568 } 1569 1570 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1571 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, 1572 VPSlotTracker &SlotTracker) const { 1573 O << Indent << "WIDEN-INTRINSIC "; 1574 if (ResultTy->isVoidTy()) { 1575 O << "void "; 1576 } else { 1577 printAsOperand(O, SlotTracker); 1578 O << " = "; 1579 } 1580 1581 O << "call"; 1582 printFlags(O); 1583 O << getIntrinsicName() << "("; 1584 1585 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { 1586 Op->printAsOperand(O, SlotTracker); 1587 }); 1588 O << ")"; 1589 } 1590 #endif 1591 1592 void VPHistogramRecipe::execute(VPTransformState &State) { 1593 IRBuilderBase &Builder = State.Builder; 1594 1595 Value *Address = State.get(getOperand(0)); 1596 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true); 1597 VectorType *VTy = cast<VectorType>(Address->getType()); 1598 1599 // The histogram intrinsic requires a mask even if the recipe doesn't; 1600 // if the mask operand was omitted then all lanes should be executed and 1601 // we just need to synthesize an all-true mask. 1602 Value *Mask = nullptr; 1603 if (VPValue *VPMask = getMask()) 1604 Mask = State.get(VPMask); 1605 else 1606 Mask = 1607 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1)); 1608 1609 // If this is a subtract, we want to invert the increment amount. We may 1610 // add a separate intrinsic in future, but for now we'll try this. 
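  // e.g. a histogram decrement by 2 is emitted as an add of -2 to each
  // selected bucket.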
1611 if (Opcode == Instruction::Sub) 1612 IncAmt = Builder.CreateNeg(IncAmt); 1613 else 1614 assert(Opcode == Instruction::Add && "only add or sub supported for now"); 1615 1616 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add, 1617 {VTy, IncAmt->getType()}, 1618 {Address, IncAmt, Mask}); 1619 } 1620 1621 InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, 1622 VPCostContext &Ctx) const { 1623 // FIXME: Take the gather and scatter into account as well. For now we're 1624 // generating the same cost as the fallback path, but we'll likely 1625 // need to create a new TTI method for determining the cost, including 1626 // whether we can use base + vec-of-smaller-indices or just 1627 // vec-of-pointers. 1628 assert(VF.isVector() && "Invalid VF for histogram cost"); 1629 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0)); 1630 VPValue *IncAmt = getOperand(1); 1631 Type *IncTy = Ctx.Types.inferScalarType(IncAmt); 1632 VectorType *VTy = VectorType::get(IncTy, VF); 1633 1634 // Assume that a non-constant update value (or a constant != 1) requires 1635 // a multiply, and add that into the cost. 1636 InstructionCost MulCost = 1637 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind); 1638 if (IncAmt->isLiveIn()) { 1639 ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue()); 1640 1641 if (CI && CI->getZExtValue() == 1) 1642 MulCost = TTI::TCC_Free; 1643 } 1644 1645 // Find the cost of the histogram operation itself. 1646 Type *PtrTy = VectorType::get(AddressTy, VF); 1647 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF); 1648 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, 1649 Type::getVoidTy(Ctx.LLVMCtx), 1650 {PtrTy, IncTy, MaskTy}); 1651 1652 // Add the costs together with the add/sub operation. 1653 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost + 1654 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind); 1655 } 1656 1657 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1658 void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, 1659 VPSlotTracker &SlotTracker) const { 1660 O << Indent << "WIDEN-HISTOGRAM buckets: "; 1661 getOperand(0)->printAsOperand(O, SlotTracker); 1662 1663 if (Opcode == Instruction::Sub) 1664 O << ", dec: "; 1665 else { 1666 assert(Opcode == Instruction::Add); 1667 O << ", inc: "; 1668 } 1669 getOperand(1)->printAsOperand(O, SlotTracker); 1670 1671 if (VPValue *Mask = getMask()) { 1672 O << ", mask: "; 1673 Mask->printAsOperand(O, SlotTracker); 1674 } 1675 } 1676 1677 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 1678 VPSlotTracker &SlotTracker) const { 1679 O << Indent << "WIDEN-SELECT "; 1680 printAsOperand(O, SlotTracker); 1681 O << " = select "; 1682 printFlags(O); 1683 getOperand(0)->printAsOperand(O, SlotTracker); 1684 O << ", "; 1685 getOperand(1)->printAsOperand(O, SlotTracker); 1686 O << ", "; 1687 getOperand(2)->printAsOperand(O, SlotTracker); 1688 O << (isInvariantCond() ? " (condition is loop invariant)" : ""); 1689 } 1690 #endif 1691 1692 void VPWidenSelectRecipe::execute(VPTransformState &State) { 1693 // The condition can be loop invariant but still defined inside the 1694 // loop. This means that we can't just use the original 'cond' value. 1695 // We have to take the 'vectorized' value and pick the first lane. 1696 // Instcombine will make this a no-op. 1697 auto *InvarCond = 1698 isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr; 1699 1700 Value *Cond = InvarCond ? 
InvarCond : State.get(getCond()); 1701 Value *Op0 = State.get(getOperand(1)); 1702 Value *Op1 = State.get(getOperand(2)); 1703 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 1704 State.set(this, Sel); 1705 if (auto *I = dyn_cast<Instruction>(Sel)) { 1706 if (isa<FPMathOperator>(I)) 1707 applyFlags(*I); 1708 applyMetadata(*I); 1709 } 1710 } 1711 1712 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, 1713 VPCostContext &Ctx) const { 1714 SelectInst *SI = cast<SelectInst>(getUnderlyingValue()); 1715 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); 1716 Type *ScalarTy = Ctx.Types.inferScalarType(this); 1717 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1718 1719 VPValue *Op0, *Op1; 1720 using namespace llvm::VPlanPatternMatch; 1721 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && 1722 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || 1723 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { 1724 // select x, y, false --> x & y 1725 // select x, true, y --> x | y 1726 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0); 1727 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1); 1728 1729 SmallVector<const Value *, 2> Operands; 1730 if (all_of(operands(), 1731 [](VPValue *Op) { return Op->getUnderlyingValue(); })) 1732 Operands.append(SI->op_begin(), SI->op_end()); 1733 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); 1734 return Ctx.TTI.getArithmeticInstrCost( 1735 IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, 1736 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); 1737 } 1738 1739 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0)); 1740 if (!ScalarCond) 1741 CondTy = VectorType::get(CondTy, VF); 1742 1743 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 1744 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 1745 Pred = Cmp->getPredicate(); 1746 return Ctx.TTI.getCmpSelInstrCost( 1747 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind, 1748 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI); 1749 } 1750 1751 VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { 1752 AllowReassoc = FMF.allowReassoc(); 1753 NoNaNs = FMF.noNaNs(); 1754 NoInfs = FMF.noInfs(); 1755 NoSignedZeros = FMF.noSignedZeros(); 1756 AllowReciprocal = FMF.allowReciprocal(); 1757 AllowContract = FMF.allowContract(); 1758 ApproxFunc = FMF.approxFunc(); 1759 } 1760 1761 #if !defined(NDEBUG) 1762 bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { 1763 switch (OpType) { 1764 case OperationType::OverflowingBinOp: 1765 return Opcode == Instruction::Add || Opcode == Instruction::Sub || 1766 Opcode == Instruction::Mul || 1767 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart; 1768 case OperationType::Trunc: 1769 return Opcode == Instruction::Trunc; 1770 case OperationType::DisjointOp: 1771 return Opcode == Instruction::Or; 1772 case OperationType::PossiblyExactOp: 1773 return Opcode == Instruction::AShr; 1774 case OperationType::GEPOp: 1775 return Opcode == Instruction::GetElementPtr || 1776 Opcode == VPInstruction::PtrAdd; 1777 case OperationType::FPMathOp: 1778 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || 1779 Opcode == Instruction::FSub || Opcode == Instruction::FNeg || 1780 Opcode == Instruction::FDiv || Opcode == Instruction::FRem || 1781 Opcode == Instruction::FCmp || Opcode == Instruction::Select || 1782 Opcode == VPInstruction::WideIVStep || 1783 Opcode == 
VPInstruction::ReductionStartVector || 1784 Opcode == VPInstruction::ComputeReductionResult; 1785 case OperationType::NonNegOp: 1786 return Opcode == Instruction::ZExt; 1787 break; 1788 case OperationType::Cmp: 1789 return Opcode == Instruction::ICmp; 1790 case OperationType::Other: 1791 return true; 1792 } 1793 llvm_unreachable("Unknown OperationType enum"); 1794 } 1795 #endif 1796 1797 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1798 void VPIRFlags::printFlags(raw_ostream &O) const { 1799 switch (OpType) { 1800 case OperationType::Cmp: 1801 O << " " << CmpInst::getPredicateName(getPredicate()); 1802 break; 1803 case OperationType::DisjointOp: 1804 if (DisjointFlags.IsDisjoint) 1805 O << " disjoint"; 1806 break; 1807 case OperationType::PossiblyExactOp: 1808 if (ExactFlags.IsExact) 1809 O << " exact"; 1810 break; 1811 case OperationType::OverflowingBinOp: 1812 if (WrapFlags.HasNUW) 1813 O << " nuw"; 1814 if (WrapFlags.HasNSW) 1815 O << " nsw"; 1816 break; 1817 case OperationType::Trunc: 1818 if (TruncFlags.HasNUW) 1819 O << " nuw"; 1820 if (TruncFlags.HasNSW) 1821 O << " nsw"; 1822 break; 1823 case OperationType::FPMathOp: 1824 getFastMathFlags().print(O); 1825 break; 1826 case OperationType::GEPOp: 1827 if (GEPFlags.isInBounds()) 1828 O << " inbounds"; 1829 else if (GEPFlags.hasNoUnsignedSignedWrap()) 1830 O << " nusw"; 1831 if (GEPFlags.hasNoUnsignedWrap()) 1832 O << " nuw"; 1833 break; 1834 case OperationType::NonNegOp: 1835 if (NonNegFlags.NonNeg) 1836 O << " nneg"; 1837 break; 1838 case OperationType::Other: 1839 break; 1840 } 1841 O << " "; 1842 } 1843 #endif 1844 1845 void VPWidenRecipe::execute(VPTransformState &State) { 1846 auto &Builder = State.Builder; 1847 switch (Opcode) { 1848 case Instruction::Call: 1849 case Instruction::Br: 1850 case Instruction::PHI: 1851 case Instruction::GetElementPtr: 1852 case Instruction::Select: 1853 llvm_unreachable("This instruction is handled by a different recipe."); 1854 case Instruction::UDiv: 1855 case Instruction::SDiv: 1856 case Instruction::SRem: 1857 case Instruction::URem: 1858 case Instruction::Add: 1859 case Instruction::FAdd: 1860 case Instruction::Sub: 1861 case Instruction::FSub: 1862 case Instruction::FNeg: 1863 case Instruction::Mul: 1864 case Instruction::FMul: 1865 case Instruction::FDiv: 1866 case Instruction::FRem: 1867 case Instruction::Shl: 1868 case Instruction::LShr: 1869 case Instruction::AShr: 1870 case Instruction::And: 1871 case Instruction::Or: 1872 case Instruction::Xor: { 1873 // Just widen unops and binops. 1874 SmallVector<Value *, 2> Ops; 1875 for (VPValue *VPOp : operands()) 1876 Ops.push_back(State.get(VPOp)); 1877 1878 Value *V = Builder.CreateNAryOp(Opcode, Ops); 1879 1880 if (auto *VecOp = dyn_cast<Instruction>(V)) { 1881 applyFlags(*VecOp); 1882 applyMetadata(*VecOp); 1883 } 1884 1885 // Use this vector value for all users of the original instruction. 1886 State.set(this, V); 1887 break; 1888 } 1889 case Instruction::ExtractValue: { 1890 assert(getNumOperands() == 2 && "expected single level extractvalue"); 1891 Value *Op = State.get(getOperand(0)); 1892 auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue()); 1893 Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue()); 1894 State.set(this, Extract); 1895 break; 1896 } 1897 case Instruction::Freeze: { 1898 Value *Op = State.get(getOperand(0)); 1899 Value *Freeze = Builder.CreateFreeze(Op); 1900 State.set(this, Freeze); 1901 break; 1902 } 1903 case Instruction::ICmp: 1904 case Instruction::FCmp: { 1905 // Widen compares. 
Generate vector compares. 1906 bool FCmp = Opcode == Instruction::FCmp; 1907 Value *A = State.get(getOperand(0)); 1908 Value *B = State.get(getOperand(1)); 1909 Value *C = nullptr; 1910 if (FCmp) { 1911 // Propagate fast math flags. 1912 C = Builder.CreateFCmpFMF( 1913 getPredicate(), A, B, 1914 dyn_cast_or_null<Instruction>(getUnderlyingValue())); 1915 } else { 1916 C = Builder.CreateICmp(getPredicate(), A, B); 1917 } 1918 if (auto *I = dyn_cast<Instruction>(C)) 1919 applyMetadata(*I); 1920 State.set(this, C); 1921 break; 1922 } 1923 default: 1924 // This instruction is not vectorized by simple widening. 1925 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " 1926 << Instruction::getOpcodeName(Opcode)); 1927 llvm_unreachable("Unhandled instruction!"); 1928 } // end of switch. 1929 1930 #if !defined(NDEBUG) 1931 // Verify that VPlan type inference results agree with the type of the 1932 // generated values. 1933 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == 1934 State.get(this)->getType() && 1935 "inferred type and type from generated instructions do not match"); 1936 #endif 1937 } 1938 1939 InstructionCost VPWidenRecipe::computeCost(ElementCount VF, 1940 VPCostContext &Ctx) const { 1941 switch (Opcode) { 1942 case Instruction::FNeg: { 1943 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1944 return Ctx.TTI.getArithmeticInstrCost( 1945 Opcode, VectorTy, Ctx.CostKind, 1946 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 1947 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}); 1948 } 1949 1950 case Instruction::UDiv: 1951 case Instruction::SDiv: 1952 case Instruction::SRem: 1953 case Instruction::URem: 1954 // More complex computation, let the legacy cost-model handle this for now. 1955 return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF); 1956 case Instruction::Add: 1957 case Instruction::FAdd: 1958 case Instruction::Sub: 1959 case Instruction::FSub: 1960 case Instruction::Mul: 1961 case Instruction::FMul: 1962 case Instruction::FDiv: 1963 case Instruction::FRem: 1964 case Instruction::Shl: 1965 case Instruction::LShr: 1966 case Instruction::AShr: 1967 case Instruction::And: 1968 case Instruction::Or: 1969 case Instruction::Xor: { 1970 VPValue *RHS = getOperand(1); 1971 // Certain instructions can be cheaper to vectorize if they have a constant 1972 // second vector operand. One example of this are shifts on x86. 1973 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS); 1974 1975 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && 1976 getOperand(1)->isDefinedOutsideLoopRegions()) 1977 RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; 1978 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1979 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 1980 1981 SmallVector<const Value *, 4> Operands; 1982 if (CtxI) 1983 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); 1984 return Ctx.TTI.getArithmeticInstrCost( 1985 Opcode, VectorTy, Ctx.CostKind, 1986 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 1987 RHSInfo, Operands, CtxI, &Ctx.TLI); 1988 } 1989 case Instruction::Freeze: { 1990 // This opcode is unknown. Assume that it is the same as 'mul'. 
1991 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 1992 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, 1993 Ctx.CostKind); 1994 } 1995 case Instruction::ExtractValue: { 1996 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, 1997 Ctx.CostKind); 1998 } 1999 case Instruction::ICmp: 2000 case Instruction::FCmp: { 2001 Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue()); 2002 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); 2003 return Ctx.TTI.getCmpSelInstrCost( 2004 Opcode, VectorTy, CmpInst::makeCmpResultType(VectorTy), getPredicate(), 2005 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 2006 {TTI::OK_AnyValue, TTI::OP_None}, CtxI); 2007 } 2008 default: 2009 llvm_unreachable("Unsupported opcode for instruction"); 2010 } 2011 } 2012 2013 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2014 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 2015 VPSlotTracker &SlotTracker) const { 2016 O << Indent << "WIDEN "; 2017 printAsOperand(O, SlotTracker); 2018 O << " = " << Instruction::getOpcodeName(Opcode); 2019 printFlags(O); 2020 printOperands(O, SlotTracker); 2021 } 2022 #endif 2023 2024 void VPWidenCastRecipe::execute(VPTransformState &State) { 2025 auto &Builder = State.Builder; 2026 /// Vectorize casts. 2027 assert(State.VF.isVector() && "Not vectorizing?"); 2028 Type *DestTy = VectorType::get(getResultType(), State.VF); 2029 VPValue *Op = getOperand(0); 2030 Value *A = State.get(Op); 2031 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); 2032 State.set(this, Cast); 2033 if (auto *CastOp = dyn_cast<Instruction>(Cast)) { 2034 applyFlags(*CastOp); 2035 applyMetadata(*CastOp); 2036 } 2037 } 2038 2039 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, 2040 VPCostContext &Ctx) const { 2041 // TODO: In some cases, VPWidenCastRecipes are created but not considered in 2042 // the legacy cost model, including truncates/extends when evaluating a 2043 // reduction in a smaller type. 2044 if (!getUnderlyingValue()) 2045 return 0; 2046 // Computes the CastContextHint from a recipes that may access memory. 2047 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { 2048 if (VF.isScalar()) 2049 return TTI::CastContextHint::Normal; 2050 if (isa<VPInterleaveRecipe>(R)) 2051 return TTI::CastContextHint::Interleave; 2052 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) 2053 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked 2054 : TTI::CastContextHint::Normal; 2055 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R); 2056 if (WidenMemoryRecipe == nullptr) 2057 return TTI::CastContextHint::None; 2058 if (!WidenMemoryRecipe->isConsecutive()) 2059 return TTI::CastContextHint::GatherScatter; 2060 if (WidenMemoryRecipe->isReverse()) 2061 return TTI::CastContextHint::Reversed; 2062 if (WidenMemoryRecipe->isMasked()) 2063 return TTI::CastContextHint::Masked; 2064 return TTI::CastContextHint::Normal; 2065 }; 2066 2067 VPValue *Operand = getOperand(0); 2068 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2069 // For Trunc/FPTrunc, get the context from the only user. 2070 if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && 2071 !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { 2072 if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin())) 2073 CCH = ComputeCCH(StoreRecipe); 2074 } 2075 // For Z/Sext, get the context from the operand. 
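// For example (illustrative, not from the original source): a zext whose
// operand is produced by a masked, consecutive widened load is given
// CastContextHint::Masked, while one fed by a non-consecutive (gather) load
// gets CastContextHint::GatherScatter; this lets targets that support
// extending loads fold the cast into the memory operation when costing it.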
2076 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 2077 Opcode == Instruction::FPExt) { 2078 if (Operand->isLiveIn()) 2079 CCH = TTI::CastContextHint::Normal; 2080 else if (Operand->getDefiningRecipe()) 2081 CCH = ComputeCCH(Operand->getDefiningRecipe()); 2082 } 2083 2084 auto *SrcTy = 2085 cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); 2086 auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF)); 2087 // Arm TTI will use the underlying instruction to determine the cost. 2088 return Ctx.TTI.getCastInstrCost( 2089 Opcode, DestTy, SrcTy, CCH, Ctx.CostKind, 2090 dyn_cast_if_present<Instruction>(getUnderlyingValue())); 2091 } 2092 2093 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2094 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, 2095 VPSlotTracker &SlotTracker) const { 2096 O << Indent << "WIDEN-CAST "; 2097 printAsOperand(O, SlotTracker); 2098 O << " = " << Instruction::getOpcodeName(Opcode); 2099 printFlags(O); 2100 printOperands(O, SlotTracker); 2101 O << " to " << *getResultType(); 2102 } 2103 #endif 2104 2105 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, 2106 VPCostContext &Ctx) const { 2107 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 2108 } 2109 2110 /// A helper function that returns an integer or floating-point constant with 2111 /// value C. 2112 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 2113 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 2114 : ConstantFP::get(Ty, C); 2115 } 2116 2117 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2118 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 2119 VPSlotTracker &SlotTracker) const { 2120 O << Indent; 2121 printAsOperand(O, SlotTracker); 2122 O << " = WIDEN-INDUCTION "; 2123 printOperands(O, SlotTracker); 2124 2125 if (auto *TI = getTruncInst()) 2126 O << " (truncated to " << *TI->getType() << ")"; 2127 } 2128 #endif 2129 2130 bool VPWidenIntOrFpInductionRecipe::isCanonical() const { 2131 // The step may be defined by a recipe in the preheader (e.g. if it requires 2132 // SCEV expansion), but for the canonical induction the step is required to be 2133 // 1, which is represented as live-in. 2134 if (getStepValue()->getDefiningRecipe()) 2135 return false; 2136 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); 2137 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); 2138 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); 2139 return StartC && StartC->isZero() && StepC && StepC->isOne() && 2140 getScalarType() == CanIV->getScalarType(); 2141 } 2142 2143 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2144 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, 2145 VPSlotTracker &SlotTracker) const { 2146 O << Indent; 2147 printAsOperand(O, SlotTracker); 2148 O << " = DERIVED-IV "; 2149 getStartValue()->printAsOperand(O, SlotTracker); 2150 O << " + "; 2151 getOperand(1)->printAsOperand(O, SlotTracker); 2152 O << " * "; 2153 getStepValue()->printAsOperand(O, SlotTracker); 2154 } 2155 #endif 2156 2157 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 2158 // Fast-math-flags propagate from the original induction instruction. 2159 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 2160 if (hasFastMathFlags()) 2161 State.Builder.setFastMathFlags(getFastMathFlags()); 2162 2163 /// Compute scalar induction steps. 
\p ScalarIV is the scalar induction 2164 /// variable on which to base the steps, \p Step is the size of the step. 2165 2166 Value *BaseIV = State.get(getOperand(0), VPLane(0)); 2167 Value *Step = State.get(getStepValue(), VPLane(0)); 2168 IRBuilderBase &Builder = State.Builder; 2169 2170 // Ensure step has the same type as that of scalar IV. 2171 Type *BaseIVTy = BaseIV->getType()->getScalarType(); 2172 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); 2173 2174 // We build scalar steps for both integer and floating-point induction 2175 // variables. Here, we determine the kind of arithmetic we will perform. 2176 Instruction::BinaryOps AddOp; 2177 Instruction::BinaryOps MulOp; 2178 if (BaseIVTy->isIntegerTy()) { 2179 AddOp = Instruction::Add; 2180 MulOp = Instruction::Mul; 2181 } else { 2182 AddOp = InductionOpcode; 2183 MulOp = Instruction::FMul; 2184 } 2185 2186 // Determine the number of scalars we need to generate for each unroll 2187 // iteration. 2188 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); 2189 // Compute the scalar steps and save the results in State. 2190 Type *IntStepTy = 2191 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); 2192 Type *VecIVTy = nullptr; 2193 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2194 if (!FirstLaneOnly && State.VF.isScalable()) { 2195 VecIVTy = VectorType::get(BaseIVTy, State.VF); 2196 UnitStepVec = 2197 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2198 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2199 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV); 2200 } 2201 2202 unsigned StartLane = 0; 2203 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2204 if (State.Lane) { 2205 StartLane = State.Lane->getKnownLane(); 2206 EndLane = StartLane + 1; 2207 } 2208 Value *StartIdx0; 2209 if (getUnrollPart(*this) == 0) 2210 StartIdx0 = ConstantInt::get(IntStepTy, 0); 2211 else { 2212 StartIdx0 = State.get(getOperand(2), true); 2213 if (getUnrollPart(*this) != 1) { 2214 StartIdx0 = 2215 Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(), 2216 getUnrollPart(*this))); 2217 } 2218 StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy); 2219 } 2220 2221 if (!FirstLaneOnly && State.VF.isScalable()) { 2222 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2223 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2224 if (BaseIVTy->isFloatingPointTy()) 2225 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2226 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2227 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2228 State.set(this, Add); 2229 // It's useful to record the lane values too for the known minimum number 2230 // of elements so we do those below. This improves the code quality when 2231 // trying to extract the first element, for example. 2232 } 2233 2234 if (BaseIVTy->isFloatingPointTy()) 2235 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); 2236 2237 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 2238 Value *StartIdx = Builder.CreateBinOp( 2239 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); 2240 // The step returned by `createStepForVF` is a runtime-evaluated value 2241 // when VF is scalable. Otherwise, it should be folded into a Constant. 
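// Worked example (illustrative values only): for an integer IV with Step = 4
// and unroll part 0, lane L below computes
//   StartIdx = 0 + L
//   Mul      = StartIdx * 4
//   Add      = BaseIV + Mul
// so consecutive lanes receive BaseIV, BaseIV + 4, BaseIV + 8, ...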
2242 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2243 "Expected StartIdx to be folded to a constant when VF is not " 2244 "scalable"); 2245 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2246 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); 2247 State.set(this, Add, VPLane(Lane)); 2248 } 2249 } 2250 2251 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2252 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, 2253 VPSlotTracker &SlotTracker) const { 2254 O << Indent; 2255 printAsOperand(O, SlotTracker); 2256 O << " = SCALAR-STEPS "; 2257 printOperands(O, SlotTracker); 2258 } 2259 #endif 2260 2261 void VPWidenGEPRecipe::execute(VPTransformState &State) { 2262 assert(State.VF.isVector() && "not widening"); 2263 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 2264 // Construct a vector GEP by widening the operands of the scalar GEP as 2265 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 2266 // results in a vector of pointers when at least one operand of the GEP 2267 // is vector-typed. Thus, to keep the representation compact, we only use 2268 // vector-typed operands for loop-varying values. 2269 2270 if (areAllOperandsInvariant()) { 2271 // If we are vectorizing, but the GEP has only loop-invariant operands, 2272 // the GEP we build (by only using vector-typed operands for 2273 // loop-varying values) would be a scalar pointer. Thus, to ensure we 2274 // produce a vector of pointers, we need to either arbitrarily pick an 2275 // operand to broadcast, or broadcast a clone of the original GEP. 2276 // Here, we broadcast a clone of the original. 2277 // 2278 // TODO: If at some point we decide to scalarize instructions having 2279 // loop-invariant operands, this special case will no longer be 2280 // required. We would add the scalarization decision to 2281 // collectLoopScalars() and teach getVectorValue() to broadcast 2282 // the lane-zero scalar value. 2283 SmallVector<Value *> Ops; 2284 for (unsigned I = 0, E = getNumOperands(); I != E; I++) 2285 Ops.push_back(State.get(getOperand(I), VPLane(0))); 2286 2287 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0], 2288 ArrayRef(Ops).drop_front(), "", 2289 getGEPNoWrapFlags()); 2290 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); 2291 State.set(this, Splat); 2292 } else { 2293 // If the GEP has at least one loop-varying operand, we are sure to 2294 // produce a vector of pointers unless VF is scalar. 2295 // The pointer operand of the new GEP. If it's loop-invariant, we 2296 // won't broadcast it. 2297 auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0)) 2298 : State.get(getOperand(0)); 2299 2300 // Collect all the indices for the new GEP. If any index is 2301 // loop-invariant, we won't broadcast it. 2302 SmallVector<Value *, 4> Indices; 2303 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 2304 VPValue *Operand = getOperand(I); 2305 if (isIndexLoopInvariant(I - 1)) 2306 Indices.push_back(State.get(Operand, VPLane(0))); 2307 else 2308 Indices.push_back(State.get(Operand)); 2309 } 2310 2311 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 2312 // but it should be a vector, otherwise. 
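// Illustrative sketch (assumed operand names): with a loop-invariant base
// pointer and a loop-varying index, the GEP created below has the form
//   %vgep = getelementptr inbounds i32, ptr %base, <4 x i64> %vec.ind
// which yields a <4 x ptr> result; only the loop-varying operands are passed
// as vectors.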
2313 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 2314 Indices, "", getGEPNoWrapFlags()); 2315 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 2316 "NewGEP is not a pointer vector"); 2317 State.set(this, NewGEP); 2318 } 2319 } 2320 2321 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2322 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 2323 VPSlotTracker &SlotTracker) const { 2324 O << Indent << "WIDEN-GEP "; 2325 O << (isPointerLoopInvariant() ? "Inv" : "Var"); 2326 for (size_t I = 0; I < getNumOperands() - 1; ++I) 2327 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; 2328 2329 O << " "; 2330 printAsOperand(O, SlotTracker); 2331 O << " = getelementptr"; 2332 printFlags(O); 2333 printOperands(O, SlotTracker); 2334 } 2335 #endif 2336 2337 static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, 2338 unsigned CurrentPart, IRBuilderBase &Builder) { 2339 // Use i32 for the gep index type when the value is constant, 2340 // or query DataLayout for a more suitable index type otherwise. 2341 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); 2342 return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0)) 2343 ? DL.getIndexType(Builder.getPtrTy(0)) 2344 : Builder.getInt32Ty(); 2345 } 2346 2347 void VPVectorEndPointerRecipe::execute(VPTransformState &State) { 2348 auto &Builder = State.Builder; 2349 unsigned CurrentPart = getUnrollPart(*this); 2350 bool IsUnitStride = Stride == 1 || Stride == -1; 2351 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true, 2352 IsUnitStride, CurrentPart, Builder); 2353 2354 // The wide store needs to start at the last vector element. 2355 Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); 2356 if (IndexTy != RunTimeVF->getType()) 2357 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); 2358 // NumElt = Stride * CurrentPart * RunTimeVF 2359 Value *NumElt = Builder.CreateMul( 2360 ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF); 2361 // LastLane = Stride * (RunTimeVF - 1) 2362 Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1)); 2363 if (Stride != 1) 2364 LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane); 2365 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2366 Value *ResultPtr = 2367 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); 2368 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", 2369 getGEPNoWrapFlags()); 2370 2371 State.set(this, ResultPtr, /*IsScalar*/ true); 2372 } 2373 2374 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2375 void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2376 VPSlotTracker &SlotTracker) const { 2377 O << Indent; 2378 printAsOperand(O, SlotTracker); 2379 O << " = vector-end-pointer"; 2380 printFlags(O); 2381 printOperands(O, SlotTracker); 2382 } 2383 #endif 2384 2385 void VPVectorPointerRecipe::execute(VPTransformState &State) { 2386 auto &Builder = State.Builder; 2387 unsigned CurrentPart = getUnrollPart(*this); 2388 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, 2389 /*IsUnitStride*/ true, CurrentPart, Builder); 2390 Value *Ptr = State.get(getOperand(0), VPLane(0)); 2391 2392 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); 2393 Value *ResultPtr = 2394 Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); 2395 2396 State.set(this, ResultPtr, /*IsScalar*/ true); 2397 } 2398 2399 
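// Example for VPVectorPointerRecipe::execute above (illustrative): with a
// fixed VF of 4, element type i32 and CurrentPart = 2, createStepForVF
// produces the constant 8, so the emitted address is
//   %gep = getelementptr i32, ptr %base, i32 8
// i.e. the pointer for part N starts N * VF elements past the base pointer.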
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2400 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, 2401 VPSlotTracker &SlotTracker) const { 2402 O << Indent; 2403 printAsOperand(O, SlotTracker); 2404 O << " = vector-pointer "; 2405 2406 printOperands(O, SlotTracker); 2407 } 2408 #endif 2409 2410 void VPBlendRecipe::execute(VPTransformState &State) { 2411 assert(isNormalized() && "Expected blend to be normalized!"); 2412 // We know that all PHIs in non-header blocks are converted into 2413 // selects, so we don't have to worry about the insertion order and we 2414 // can just use the builder. 2415 // At this point we generate the predication tree. There may be 2416 // duplications since this is a simple recursive scan, but future 2417 // optimizations will clean it up. 2418 2419 unsigned NumIncoming = getNumIncomingValues(); 2420 2421 // Generate a sequence of selects of the form: 2422 // SELECT(Mask3, In3, 2423 // SELECT(Mask2, In2, 2424 // SELECT(Mask1, In1, 2425 // In0))) 2426 // Note that Mask0 is never used: lanes for which no path reaches this phi and 2427 // are essentially undef are taken from In0. 2428 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); 2429 Value *Result = nullptr; 2430 for (unsigned In = 0; In < NumIncoming; ++In) { 2431 // We might have single edge PHIs (blocks) - use an identity 2432 // 'select' for the first PHI operand. 2433 Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); 2434 if (In == 0) 2435 Result = In0; // Initialize with the first incoming value. 2436 else { 2437 // Select between the current value and the previous incoming edge 2438 // based on the incoming mask. 2439 Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); 2440 Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); 2441 } 2442 } 2443 State.set(this, Result, OnlyFirstLaneUsed); 2444 } 2445 2446 InstructionCost VPBlendRecipe::computeCost(ElementCount VF, 2447 VPCostContext &Ctx) const { 2448 // Handle cases where only the first lane is used the same way as the legacy 2449 // cost model. 2450 if (vputils::onlyFirstLaneUsed(this)) 2451 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 2452 2453 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); 2454 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); 2455 return (getNumIncomingValues() - 1) * 2456 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, 2457 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); 2458 } 2459 2460 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2461 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 2462 VPSlotTracker &SlotTracker) const { 2463 O << Indent << "BLEND "; 2464 printAsOperand(O, SlotTracker); 2465 O << " ="; 2466 if (getNumIncomingValues() == 1) { 2467 // Not a User of any mask: not really blending, this is a 2468 // single-predecessor phi. 
2469 O << " "; 2470 getIncomingValue(0)->printAsOperand(O, SlotTracker); 2471 } else { 2472 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 2473 O << " "; 2474 getIncomingValue(I)->printAsOperand(O, SlotTracker); 2475 if (I == 0) 2476 continue; 2477 O << "/"; 2478 getMask(I)->printAsOperand(O, SlotTracker); 2479 } 2480 } 2481 } 2482 #endif 2483 2484 void VPReductionRecipe::execute(VPTransformState &State) { 2485 assert(!State.Lane && "Reduction being replicated."); 2486 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2487 RecurKind Kind = getRecurrenceKind(); 2488 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 2489 "In-loop AnyOf reductions aren't currently supported"); 2490 // Propagate the fast-math flags carried by the underlying instruction. 2491 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 2492 State.Builder.setFastMathFlags(getFastMathFlags()); 2493 Value *NewVecOp = State.get(getVecOp()); 2494 if (VPValue *Cond = getCondOp()) { 2495 Value *NewCond = State.get(Cond, State.VF.isScalar()); 2496 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 2497 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); 2498 2499 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags()); 2500 if (State.VF.isVector()) 2501 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); 2502 2503 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); 2504 NewVecOp = Select; 2505 } 2506 Value *NewRed; 2507 Value *NextInChain; 2508 if (IsOrdered) { 2509 if (State.VF.isVector()) 2510 NewRed = 2511 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain); 2512 else 2513 NewRed = State.Builder.CreateBinOp( 2514 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), 2515 PrevInChain, NewVecOp); 2516 PrevInChain = NewRed; 2517 NextInChain = NewRed; 2518 } else { 2519 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); 2520 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind); 2521 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2522 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain); 2523 else 2524 NextInChain = State.Builder.CreateBinOp( 2525 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed, 2526 PrevInChain); 2527 } 2528 State.set(this, NextInChain, /*IsScalar*/ true); 2529 } 2530 2531 void VPReductionEVLRecipe::execute(VPTransformState &State) { 2532 assert(!State.Lane && "Reduction being replicated."); 2533 2534 auto &Builder = State.Builder; 2535 // Propagate the fast-math flags carried by the underlying instruction. 
2536 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); 2537 Builder.setFastMathFlags(getFastMathFlags()); 2538 2539 RecurKind Kind = getRecurrenceKind(); 2540 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); 2541 Value *VecOp = State.get(getVecOp()); 2542 Value *EVL = State.get(getEVL(), VPLane(0)); 2543 2544 Value *Mask; 2545 if (VPValue *CondOp = getCondOp()) 2546 Mask = State.get(CondOp); 2547 else 2548 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 2549 2550 Value *NewRed; 2551 if (isOrdered()) { 2552 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL); 2553 } else { 2554 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL); 2555 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 2556 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); 2557 else 2558 NewRed = Builder.CreateBinOp( 2559 (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed, 2560 Prev); 2561 } 2562 State.set(this, NewRed, /*IsScalar*/ true); 2563 } 2564 2565 InstructionCost VPReductionRecipe::computeCost(ElementCount VF, 2566 VPCostContext &Ctx) const { 2567 RecurKind RdxKind = getRecurrenceKind(); 2568 Type *ElementTy = Ctx.Types.inferScalarType(this); 2569 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF)); 2570 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind); 2571 FastMathFlags FMFs = getFastMathFlags(); 2572 std::optional<FastMathFlags> OptionalFMF = 2573 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; 2574 2575 // TODO: Support any-of reductions. 2576 assert( 2577 (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || 2578 ForceTargetInstructionCost.getNumOccurrences() > 0) && 2579 "Any-of reduction not implemented in VPlan-based cost model currently."); 2580 2581 // Note that TTI should model the cost of moving result to the scalar register 2582 // and the BinOp cost in the getMinMaxReductionCost(). 2583 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { 2584 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); 2585 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); 2586 } 2587 2588 // Note that TTI should model the cost of moving result to the scalar register 2589 // and the BinOp cost in the getArithmeticReductionCost(). 2590 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF, 2591 Ctx.CostKind); 2592 } 2593 2594 VPExpressionRecipe::VPExpressionRecipe( 2595 ExpressionTypes ExpressionType, 2596 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes) 2597 : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}), 2598 ExpressionRecipes(SetVector<VPSingleDefRecipe *>( 2599 ExpressionRecipes.begin(), ExpressionRecipes.end()) 2600 .takeVector()), 2601 ExpressionType(ExpressionType) { 2602 assert(!ExpressionRecipes.empty() && "Nothing to combine?"); 2603 assert( 2604 none_of(ExpressionRecipes, 2605 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && 2606 "expression cannot contain recipes with side-effects"); 2607 2608 // Maintain a copy of the expression recipes as a set of users. 2609 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers; 2610 for (auto *R : ExpressionRecipes) 2611 ExpressionRecipesAsSetOfUsers.insert(R); 2612 2613 // Recipes in the expression, except the last one, must only be used by 2614 // (other) recipes inside the expression. If there are other users, external 2615 // to the expression, use a clone of the recipe for external users. 
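// Illustrative example (assumption based on the expression types handled
// below): an extended multiply-accumulate reduction bundles the recipes
//   zext(%a), zext(%b), mul, reduce.add
// into a single VPExpressionRecipe so the whole chain can be costed as one
// unit; any bundled recipe that also has users outside the chain is cloned
// for those external users by the loop that follows.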
2616 for (VPSingleDefRecipe *R : ExpressionRecipes) { 2617 if (R != ExpressionRecipes.back() && 2618 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) { 2619 return !ExpressionRecipesAsSetOfUsers.contains(U); 2620 })) { 2621 // There are users outside of the expression. Clone the recipe and use the 2622 // clone those external users. 2623 VPSingleDefRecipe *CopyForExtUsers = R->clone(); 2624 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers]( 2625 VPUser &U, unsigned) { 2626 return !ExpressionRecipesAsSetOfUsers.contains(&U); 2627 }); 2628 CopyForExtUsers->insertBefore(R); 2629 } 2630 if (R->getParent()) 2631 R->removeFromParent(); 2632 } 2633 2634 // Internalize all external operands to the expression recipes. To do so, 2635 // create new temporary VPValues for all operands defined by a recipe outside 2636 // the expression. The original operands are added as operands of the 2637 // VPExpressionRecipe itself. 2638 for (auto *R : ExpressionRecipes) { 2639 for (const auto &[Idx, Op] : enumerate(R->operands())) { 2640 auto *Def = Op->getDefiningRecipe(); 2641 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def)) 2642 continue; 2643 addOperand(Op); 2644 LiveInPlaceholders.push_back(new VPValue()); 2645 R->setOperand(Idx, LiveInPlaceholders.back()); 2646 } 2647 } 2648 } 2649 2650 void VPExpressionRecipe::decompose() { 2651 for (auto *R : ExpressionRecipes) 2652 R->insertBefore(this); 2653 2654 for (const auto &[Idx, Op] : enumerate(operands())) 2655 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op); 2656 2657 replaceAllUsesWith(ExpressionRecipes.back()); 2658 ExpressionRecipes.clear(); 2659 } 2660 2661 InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, 2662 VPCostContext &Ctx) const { 2663 Type *RedTy = Ctx.Types.inferScalarType(this); 2664 auto *SrcVecTy = cast<VectorType>( 2665 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)); 2666 assert(RedTy->isIntegerTy() && 2667 "VPExpressionRecipe only supports integer types currently."); 2668 switch (ExpressionType) { 2669 case ExpressionTypes::ExtendedReduction: { 2670 unsigned Opcode = RecurrenceDescriptor::getOpcode( 2671 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind()); 2672 return Ctx.TTI.getExtendedReductionCost( 2673 Opcode, 2674 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == 2675 Instruction::ZExt, 2676 RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); 2677 } 2678 case ExpressionTypes::MulAccReduction: 2679 return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind); 2680 2681 case ExpressionTypes::ExtMulAccReduction: 2682 return Ctx.TTI.getMulAccReductionCost( 2683 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == 2684 Instruction::ZExt, 2685 RedTy, SrcVecTy, Ctx.CostKind); 2686 } 2687 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); 2688 } 2689 2690 bool VPExpressionRecipe::mayReadOrWriteMemory() const { 2691 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) { 2692 return R->mayReadFromMemory() || R->mayWriteToMemory(); 2693 }); 2694 } 2695 2696 bool VPExpressionRecipe::mayHaveSideEffects() const { 2697 assert( 2698 none_of(ExpressionRecipes, 2699 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && 2700 "expression cannot contain recipes with side-effects"); 2701 return false; 2702 } 2703 2704 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2705 2706 void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, 2707 VPSlotTracker &SlotTracker) const { 2708 O << Indent << 
"EXPRESSION "; 2709 printAsOperand(O, SlotTracker); 2710 O << " = "; 2711 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back()); 2712 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); 2713 2714 switch (ExpressionType) { 2715 case ExpressionTypes::ExtendedReduction: { 2716 getOperand(1)->printAsOperand(O, SlotTracker); 2717 O << " +"; 2718 O << " reduce." << Instruction::getOpcodeName(Opcode) << " ("; 2719 getOperand(0)->printAsOperand(O, SlotTracker); 2720 Red->printFlags(O); 2721 2722 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); 2723 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " 2724 << *Ext0->getResultType(); 2725 if (Red->isConditional()) { 2726 O << ", "; 2727 Red->getCondOp()->printAsOperand(O, SlotTracker); 2728 } 2729 O << ")"; 2730 break; 2731 } 2732 case ExpressionTypes::MulAccReduction: 2733 case ExpressionTypes::ExtMulAccReduction: { 2734 getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); 2735 O << " + "; 2736 O << "reduce." 2737 << Instruction::getOpcodeName( 2738 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) 2739 << " ("; 2740 O << "mul"; 2741 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; 2742 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2] 2743 : ExpressionRecipes[0]); 2744 Mul->printFlags(O); 2745 if (IsExtended) 2746 O << "("; 2747 getOperand(0)->printAsOperand(O, SlotTracker); 2748 if (IsExtended) { 2749 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]); 2750 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " 2751 << *Ext0->getResultType() << "), ("; 2752 } else { 2753 O << ", "; 2754 } 2755 getOperand(1)->printAsOperand(O, SlotTracker); 2756 if (IsExtended) { 2757 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]); 2758 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " 2759 << *Ext1->getResultType() << ")"; 2760 } 2761 if (Red->isConditional()) { 2762 O << ", "; 2763 Red->getCondOp()->printAsOperand(O, SlotTracker); 2764 } 2765 O << ")"; 2766 break; 2767 } 2768 } 2769 } 2770 2771 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 2772 VPSlotTracker &SlotTracker) const { 2773 O << Indent << "REDUCE "; 2774 printAsOperand(O, SlotTracker); 2775 O << " = "; 2776 getChainOp()->printAsOperand(O, SlotTracker); 2777 O << " +"; 2778 printFlags(O); 2779 O << " reduce." 2780 << Instruction::getOpcodeName( 2781 RecurrenceDescriptor::getOpcode(getRecurrenceKind())) 2782 << " ("; 2783 getVecOp()->printAsOperand(O, SlotTracker); 2784 if (isConditional()) { 2785 O << ", "; 2786 getCondOp()->printAsOperand(O, SlotTracker); 2787 } 2788 O << ")"; 2789 } 2790 2791 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, 2792 VPSlotTracker &SlotTracker) const { 2793 O << Indent << "REDUCE "; 2794 printAsOperand(O, SlotTracker); 2795 O << " = "; 2796 getChainOp()->printAsOperand(O, SlotTracker); 2797 O << " +"; 2798 printFlags(O); 2799 O << " vp.reduce." 2800 << Instruction::getOpcodeName( 2801 RecurrenceDescriptor::getOpcode(getRecurrenceKind())) 2802 << " ("; 2803 getVecOp()->printAsOperand(O, SlotTracker); 2804 O << ", "; 2805 getEVL()->printAsOperand(O, SlotTracker); 2806 if (isConditional()) { 2807 O << ", "; 2808 getCondOp()->printAsOperand(O, SlotTracker); 2809 } 2810 O << ")"; 2811 } 2812 2813 #endif 2814 2815 /// A helper function to scalarize a single Instruction in the innermost loop. 2816 /// Generates a sequence of scalar instances for lane \p Lane. 
Uses the VPValue 2817 /// operands from \p RepRecipe instead of \p Instr's operands. 2818 static void scalarizeInstruction(const Instruction *Instr, 2819 VPReplicateRecipe *RepRecipe, 2820 const VPLane &Lane, VPTransformState &State) { 2821 assert((!Instr->getType()->isAggregateType() || 2822 canVectorizeTy(Instr->getType())) && 2823 "Expected vectorizable or non-aggregate type."); 2824 2825 // Does this instruction return a value ? 2826 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2827 2828 Instruction *Cloned = Instr->clone(); 2829 if (!IsVoidRetTy) { 2830 Cloned->setName(Instr->getName() + ".cloned"); 2831 #if !defined(NDEBUG) 2832 // Verify that VPlan type inference results agree with the type of the 2833 // generated values. 2834 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2835 "inferred type and type from generated instructions do not match"); 2836 #endif 2837 } 2838 2839 RepRecipe->applyFlags(*Cloned); 2840 RepRecipe->applyMetadata(*Cloned); 2841 2842 if (auto DL = RepRecipe->getDebugLoc()) 2843 State.setDebugLocFrom(DL); 2844 2845 // Replace the operands of the cloned instructions with their scalar 2846 // equivalents in the new loop. 2847 for (const auto &I : enumerate(RepRecipe->operands())) { 2848 auto InputLane = Lane; 2849 VPValue *Operand = I.value(); 2850 if (vputils::isSingleScalar(Operand)) 2851 InputLane = VPLane::getFirstLane(); 2852 Cloned->setOperand(I.index(), State.get(Operand, InputLane)); 2853 } 2854 2855 // Place the cloned scalar in the new loop. 2856 State.Builder.Insert(Cloned); 2857 2858 State.set(RepRecipe, Cloned, Lane); 2859 2860 // If we just cloned a new assumption, add it the assumption cache. 2861 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2862 State.AC->registerAssumption(II); 2863 2864 assert( 2865 (RepRecipe->getParent()->getParent() || 2866 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || 2867 all_of(RepRecipe->operands(), 2868 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && 2869 "Expected a recipe is either within a region or all of its operands " 2870 "are defined outside the vectorized region."); 2871 } 2872 2873 void VPReplicateRecipe::execute(VPTransformState &State) { 2874 Instruction *UI = getUnderlyingInstr(); 2875 2876 if (!State.Lane) { 2877 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions " 2878 "must have already been unrolled"); 2879 scalarizeInstruction(UI, this, VPLane(0), State); 2880 return; 2881 } 2882 2883 assert((State.VF.isScalar() || !isSingleScalar()) && 2884 "uniform recipe shouldn't be predicated"); 2885 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 2886 scalarizeInstruction(UI, this, *State.Lane, State); 2887 // Insert scalar instance packing it into a vector. 2888 if (State.VF.isVector() && shouldPack()) { 2889 Value *WideValue = 2890 State.Lane->isFirstLane() 2891 ? PoisonValue::get(VectorType::get(UI->getType(), State.VF)) 2892 : State.get(this); 2893 State.set(this, State.packScalarIntoVectorizedValue(this, WideValue, 2894 *State.Lane)); 2895 } 2896 } 2897 2898 bool VPReplicateRecipe::shouldPack() const { 2899 // Find if the recipe is used by a widened recipe via an intervening 2900 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. 
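// Illustrative example (assumption): a predicated, scalarized load whose
// result reaches a widened user only through a VPPredInstPHIRecipe must be
// packed lane by lane into a vector (see the packing in execute() above);
// if all users consume individual scalar lanes, no packing is needed.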
2901 return any_of(users(), [](const VPUser *U) { 2902 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U)) 2903 return any_of(PredR->users(), [PredR](const VPUser *U) { 2904 return !U->usesScalars(PredR); 2905 }); 2906 return false; 2907 }); 2908 } 2909 2910 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, 2911 VPCostContext &Ctx) const { 2912 Instruction *UI = cast<Instruction>(getUnderlyingValue()); 2913 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan 2914 // transform, avoid computing their cost multiple times for now. 2915 Ctx.SkipCostComputation.insert(UI); 2916 2917 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2918 Type *ResultTy = Ctx.Types.inferScalarType(this); 2919 switch (UI->getOpcode()) { 2920 case Instruction::GetElementPtr: 2921 // We mark this instruction as zero-cost because the cost of GEPs in 2922 // vectorized code depends on whether the corresponding memory instruction 2923 // is scalarized or not. Therefore, we handle GEPs with the memory 2924 // instruction cost. 2925 return 0; 2926 case Instruction::Add: 2927 case Instruction::Sub: 2928 case Instruction::FAdd: 2929 case Instruction::FSub: 2930 case Instruction::Mul: 2931 case Instruction::FMul: 2932 case Instruction::FDiv: 2933 case Instruction::FRem: 2934 case Instruction::Shl: 2935 case Instruction::LShr: 2936 case Instruction::AShr: 2937 case Instruction::And: 2938 case Instruction::Or: 2939 case Instruction::Xor: { 2940 auto Op2Info = Ctx.getOperandInfo(getOperand(1)); 2941 SmallVector<const Value *, 4> Operands(UI->operand_values()); 2942 return Ctx.TTI.getArithmeticInstrCost( 2943 UI->getOpcode(), ResultTy, CostKind, 2944 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 2945 Op2Info, Operands, UI, &Ctx.TLI) * 2946 (isSingleScalar() ? 1 : VF.getFixedValue()); 2947 } 2948 } 2949 2950 return Ctx.getLegacyCost(UI, VF); 2951 } 2952 2953 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2954 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 2955 VPSlotTracker &SlotTracker) const { 2956 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE "); 2957 2958 if (!getUnderlyingInstr()->getType()->isVoidTy()) { 2959 printAsOperand(O, SlotTracker); 2960 O << " = "; 2961 } 2962 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) { 2963 O << "call"; 2964 printFlags(O); 2965 O << "@" << CB->getCalledFunction()->getName() << "("; 2966 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), 2967 O, [&O, &SlotTracker](VPValue *Op) { 2968 Op->printAsOperand(O, SlotTracker); 2969 }); 2970 O << ")"; 2971 } else { 2972 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); 2973 printFlags(O); 2974 printOperands(O, SlotTracker); 2975 } 2976 2977 if (shouldPack()) 2978 O << " (S->V)"; 2979 } 2980 #endif 2981 2982 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 2983 assert(State.Lane && "Branch on Mask works only on single instance."); 2984 2985 VPValue *BlockInMask = getOperand(0); 2986 Value *ConditionBit = State.get(BlockInMask, *State.Lane); 2987 2988 // Replace the temporary unreachable terminator with a new conditional branch, 2989 // whose two destinations will be set later when they are created. 
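// Illustrative sketch (block names assumed): for lane L of a replicate
// region this emits a guard of the form
//   %c = extractelement <4 x i1> %mask, i32 L
//   br i1 %c, label %pred.if, label %pred.continue
// where both branch targets are filled in later, once the corresponding
// basic blocks have been created.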
2990 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 2991 assert(isa<UnreachableInst>(CurrentTerminator) && 2992 "Expected to replace unreachable terminator with conditional branch."); 2993 auto CondBr = 2994 State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr); 2995 CondBr->setSuccessor(0, nullptr); 2996 CurrentTerminator->eraseFromParent(); 2997 } 2998 2999 InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, 3000 VPCostContext &Ctx) const { 3001 // The legacy cost model doesn't assign costs to branches for individual 3002 // replicate regions. Match the current behavior in the VPlan cost model for 3003 // now. 3004 return 0; 3005 } 3006 3007 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 3008 assert(State.Lane && "Predicated instruction PHI works per instance."); 3009 Instruction *ScalarPredInst = 3010 cast<Instruction>(State.get(getOperand(0), *State.Lane)); 3011 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 3012 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 3013 assert(PredicatingBB && "Predicated block has no single predecessor."); 3014 assert(isa<VPReplicateRecipe>(getOperand(0)) && 3015 "operand must be VPReplicateRecipe"); 3016 3017 // By current pack/unpack logic we need to generate only a single phi node: if 3018 // a vector value for the predicated instruction exists at this point it means 3019 // the instruction has vector users only, and a phi for the vector value is 3020 // needed. In this case the recipe of the predicated instruction is marked to 3021 // also do that packing, thereby "hoisting" the insert-element sequence. 3022 // Otherwise, a phi node for the scalar value is needed. 3023 if (State.hasVectorValue(getOperand(0))) { 3024 Value *VectorValue = State.get(getOperand(0)); 3025 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 3026 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 3027 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 3028 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 3029 if (State.hasVectorValue(this)) 3030 State.reset(this, VPhi); 3031 else 3032 State.set(this, VPhi); 3033 // NOTE: Currently we need to update the value of the operand, so the next 3034 // predicated iteration inserts its generated value in the correct vector. 3035 State.reset(getOperand(0), VPhi); 3036 } else { 3037 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane()) 3038 return; 3039 3040 Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0)); 3041 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 3042 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 3043 PredicatingBB); 3044 Phi->addIncoming(ScalarPredInst, PredicatedBB); 3045 if (State.hasScalarValue(this, *State.Lane)) 3046 State.reset(this, Phi, *State.Lane); 3047 else 3048 State.set(this, Phi, *State.Lane); 3049 // NOTE: Currently we need to update the value of the operand, so the next 3050 // predicated iteration inserts its generated value in the correct vector. 
3051 State.reset(getOperand(0), Phi, *State.Lane); 3052 } 3053 } 3054 3055 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3056 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3057 VPSlotTracker &SlotTracker) const { 3058 O << Indent << "PHI-PREDICATED-INSTRUCTION "; 3059 printAsOperand(O, SlotTracker); 3060 O << " = "; 3061 printOperands(O, SlotTracker); 3062 } 3063 #endif 3064 3065 InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, 3066 VPCostContext &Ctx) const { 3067 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 3068 const Align Alignment = getLoadStoreAlignment(&Ingredient); 3069 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr())) 3070 ->getAddressSpace(); 3071 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) 3072 ? Instruction::Load 3073 : Instruction::Store; 3074 3075 if (!Consecutive) { 3076 // TODO: Using the original IR may not be accurate. 3077 // Currently, ARM will use the underlying IR to calculate gather/scatter 3078 // instruction cost. 3079 const Value *Ptr = getLoadStorePointerOperand(&Ingredient); 3080 assert(!Reverse && 3081 "Inconsecutive memory access should not have the order."); 3082 return Ctx.TTI.getAddressComputationCost(Ty) + 3083 Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, 3084 Ctx.CostKind, &Ingredient); 3085 } 3086 3087 InstructionCost Cost = 0; 3088 if (IsMasked) { 3089 Cost += 3090 Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind); 3091 } else { 3092 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( 3093 isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0) 3094 : getOperand(1)); 3095 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind, 3096 OpInfo, &Ingredient); 3097 } 3098 if (!Reverse) 3099 return Cost; 3100 3101 return Cost += Ctx.TTI.getShuffleCost( 3102 TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty), 3103 cast<VectorType>(Ty), {}, Ctx.CostKind, 0); 3104 } 3105 3106 void VPWidenLoadRecipe::execute(VPTransformState &State) { 3107 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 3108 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 3109 const Align Alignment = getLoadStoreAlignment(&Ingredient); 3110 bool CreateGather = !isConsecutive(); 3111 3112 auto &Builder = State.Builder; 3113 Value *Mask = nullptr; 3114 if (auto *VPMask = getMask()) { 3115 // Mask reversal is only needed for non-all-one (null) masks, as reverse 3116 // of a null all-one mask is a null mask. 
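  // For example, with VF = 4 a mask of <1,1,1,0> becomes <0,1,1,1> after the
  // reverse, whereas an absent mask (all lanes active) stays absent.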
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(Mask, "reverse");
  }

  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
  Value *NewLI;
  if (CreateGather) {
    NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
                                       "wide.masked.gather");
  } else if (Mask) {
    NewLI =
        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
                                 PoisonValue::get(DataTy), "wide.masked.load");
  } else {
    NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
  }
  applyMetadata(*cast<Instruction>(NewLI));
  if (Reverse)
    NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
  State.set(this, NewLI);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
#endif

/// Use an all-true mask for the reverse rather than the actual mask, as this
/// avoids a dependence without affecting the result.
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
                                     Value *EVL, const Twine &Name) {
  VectorType *ValTy = cast<VectorType>(Operand->getType());
  Value *AllTrueMask =
      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
}

void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  CallInst *NewLI;
  Value *EVL = State.get(getEVL(), VPLane(0));
  Value *Addr = State.get(getAddr(), !CreateGather);
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  }

  if (CreateGather) {
    NewLI =
        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                nullptr, "wide.masked.gather");
  } else {
    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
                                    {Addr, Mask, EVL}, nullptr, "vp.op.load");
  }
  NewLI->addParamAttr(
      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
  applyMetadata(*NewLI);
  Instruction *Res = NewLI;
  if (isReverse())
    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
  State.set(this, Res);
}

InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
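  // Sketch of the resulting cost (actual numbers are target-dependent via
  // TTI): the masked-load cost of the vector type, plus one SK_Reverse
  // shuffle if the access is reversed.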
3207 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); 3208 const Align Alignment = getLoadStoreAlignment(&Ingredient); 3209 unsigned AS = getLoadStoreAddressSpace(&Ingredient); 3210 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( 3211 Instruction::Load, Ty, Alignment, AS, Ctx.CostKind); 3212 if (!Reverse) 3213 return Cost; 3214 3215 return Cost + Ctx.TTI.getShuffleCost( 3216 TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty), 3217 cast<VectorType>(Ty), {}, Ctx.CostKind, 0); 3218 } 3219 3220 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3221 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, 3222 VPSlotTracker &SlotTracker) const { 3223 O << Indent << "WIDEN "; 3224 printAsOperand(O, SlotTracker); 3225 O << " = vp.load "; 3226 printOperands(O, SlotTracker); 3227 } 3228 #endif 3229 3230 void VPWidenStoreRecipe::execute(VPTransformState &State) { 3231 VPValue *StoredVPValue = getStoredValue(); 3232 bool CreateScatter = !isConsecutive(); 3233 const Align Alignment = getLoadStoreAlignment(&Ingredient); 3234 3235 auto &Builder = State.Builder; 3236 3237 Value *Mask = nullptr; 3238 if (auto *VPMask = getMask()) { 3239 // Mask reversal is only needed for non-all-one (null) masks, as reverse 3240 // of a null all-one mask is a null mask. 3241 Mask = State.get(VPMask); 3242 if (isReverse()) 3243 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 3244 } 3245 3246 Value *StoredVal = State.get(StoredVPValue); 3247 if (isReverse()) { 3248 // If we store to reverse consecutive memory locations, then we need 3249 // to reverse the order of elements in the stored value. 3250 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 3251 // We don't want to update the value in the map as it might be used in 3252 // another expression. So don't call resetVectorValue(StoredVal). 
  }
  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
  Instruction *NewSI = nullptr;
  if (CreateScatter)
    NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
  else if (Mask)
    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
  else
    NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
  applyMetadata(*NewSI);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = getStoredValue();
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(&Ingredient);

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(StoredValue);
  Value *EVL = State.get(getEVL(), VPLane(0));
  if (isReverse())
    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  }
  Value *Addr = State.get(getAddr(), !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                    Intrinsic::vp_scatter,
                                    {StoredVal, Addr, Mask, EVL});
  } else {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                    Intrinsic::vp_store,
                                    {StoredVal, Addr, Mask, EVL});
  }
  NewSI->addParamAttr(
      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
  applyMetadata(*NewSI);
}

InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  unsigned AS = getLoadStoreAddressSpace(&Ingredient);
  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
      Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
  if (!Reverse)
    return Cost;

  return Cost + Ctx.TTI.getShuffleCost(
                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
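  // For example, <4 x float> -> <4 x i32> is handled by a single bitcast,
  // while <4 x ptr> -> <4 x double> (assuming 64-bit pointers) takes the
  // two-step route below via <4 x i64>.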
3342 auto VF = DstVTy->getElementCount(); 3343 auto *SrcVecTy = cast<VectorType>(V->getType()); 3344 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 3345 Type *SrcElemTy = SrcVecTy->getElementType(); 3346 Type *DstElemTy = DstVTy->getElementType(); 3347 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3348 "Vector elements must have same size"); 3349 3350 // Do a direct cast if element types are castable. 3351 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3352 return Builder.CreateBitOrPointerCast(V, DstVTy); 3353 } 3354 // V cannot be directly casted to desired vector type. 3355 // May happen when V is a floating point vector but DstVTy is a vector of 3356 // pointers or vice-versa. Handle this using a two-step bitcast using an 3357 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3358 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3359 "Only one type should be a pointer type"); 3360 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3361 "Only one type should be a floating point type"); 3362 Type *IntTy = 3363 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3364 auto *VecIntTy = VectorType::get(IntTy, VF); 3365 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3366 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 3367 } 3368 3369 /// Return a vector containing interleaved elements from multiple 3370 /// smaller input vectors. 3371 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 3372 const Twine &Name) { 3373 unsigned Factor = Vals.size(); 3374 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 3375 3376 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 3377 #ifndef NDEBUG 3378 for (Value *Val : Vals) 3379 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 3380 #endif 3381 3382 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 3383 // must use intrinsics to interleave. 3384 if (VecTy->isScalableTy()) { 3385 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors"); 3386 VectorType *InterleaveTy = 3387 VectorType::get(VecTy->getElementType(), 3388 VecTy->getElementCount().multiplyCoefficientBy(Factor)); 3389 return Builder.CreateIntrinsic(InterleaveTy, 3390 getInterleaveIntrinsicID(Factor), Vals, 3391 /*FMFSource=*/nullptr, Name); 3392 } 3393 3394 // Fixed length. Start by concatenating all vectors into a wide vector. 3395 Value *WideVec = concatenateVectors(Builder, Vals); 3396 3397 // Interleave the elements into the wide vector. 3398 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 3399 return Builder.CreateShuffleVector( 3400 WideVec, createInterleaveMask(NumElts, Factor), Name); 3401 } 3402 3403 // Try to vectorize the interleave group that \p Instr belongs to. 3404 // 3405 // E.g. Translate following interleaved load group (factor = 3): 3406 // for (i = 0; i < N; i+=3) { 3407 // R = Pic[i]; // Member of index 0 3408 // G = Pic[i+1]; // Member of index 1 3409 // B = Pic[i+2]; // Member of index 2 3410 // ... 
// do something to R, G, B 3411 // } 3412 // To: 3413 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 3414 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 3415 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 3416 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 3417 // 3418 // Or translate following interleaved store group (factor = 3): 3419 // for (i = 0; i < N; i+=3) { 3420 // ... do something to R, G, B 3421 // Pic[i] = R; // Member of index 0 3422 // Pic[i+1] = G; // Member of index 1 3423 // Pic[i+2] = B; // Member of index 2 3424 // } 3425 // To: 3426 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 3427 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 3428 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 3429 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 3430 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 3431 void VPInterleaveRecipe::execute(VPTransformState &State) { 3432 assert(!State.Lane && "Interleave group being replicated."); 3433 const InterleaveGroup<Instruction> *Group = IG; 3434 Instruction *Instr = Group->getInsertPos(); 3435 3436 // Prepare for the vector type of the interleaved load/store. 3437 Type *ScalarTy = getLoadStoreType(Instr); 3438 unsigned InterleaveFactor = Group->getFactor(); 3439 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); 3440 3441 VPValue *BlockInMask = getMask(); 3442 VPValue *Addr = getAddr(); 3443 Value *ResAddr = State.get(Addr, VPLane(0)); 3444 Value *PoisonVec = PoisonValue::get(VecTy); 3445 3446 auto CreateGroupMask = [&BlockInMask, &State, 3447 &InterleaveFactor](Value *MaskForGaps) -> Value * { 3448 if (State.VF.isScalable()) { 3449 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 3450 assert(InterleaveFactor <= 8 && 3451 "Unsupported deinterleave factor for scalable vectors"); 3452 auto *ResBlockInMask = State.get(BlockInMask); 3453 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); 3454 return interleaveVectors(State.Builder, Ops, "interleaved.mask"); 3455 } 3456 3457 if (!BlockInMask) 3458 return MaskForGaps; 3459 3460 Value *ResBlockInMask = State.get(BlockInMask); 3461 Value *ShuffledMask = State.Builder.CreateShuffleVector( 3462 ResBlockInMask, 3463 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()), 3464 "interleaved.mask"); 3465 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And, 3466 ShuffledMask, MaskForGaps) 3467 : ShuffledMask; 3468 }; 3469 3470 const DataLayout &DL = Instr->getDataLayout(); 3471 // Vectorize the interleaved load group. 
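  // A wide (possibly masked) load is emitted first; the loaded value is then
  // split per member, using a vector deinterleave intrinsic for scalable
  // vectors or strided shufflevector masks for fixed-width vectors.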
3472 if (isa<LoadInst>(Instr)) { 3473 Value *MaskForGaps = nullptr; 3474 if (NeedsMaskForGaps) { 3475 MaskForGaps = 3476 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group); 3477 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 3478 } 3479 3480 Instruction *NewLoad; 3481 if (BlockInMask || MaskForGaps) { 3482 Value *GroupMask = CreateGroupMask(MaskForGaps); 3483 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr, 3484 Group->getAlign(), GroupMask, 3485 PoisonVec, "wide.masked.vec"); 3486 } else 3487 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr, 3488 Group->getAlign(), "wide.vec"); 3489 Group->addMetadata(NewLoad); 3490 3491 ArrayRef<VPValue *> VPDefs = definedValues(); 3492 const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); 3493 if (VecTy->isScalableTy()) { 3494 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 3495 // so must use intrinsics to deinterleave. 3496 assert(InterleaveFactor <= 8 && 3497 "Unsupported deinterleave factor for scalable vectors"); 3498 Value *Deinterleave = State.Builder.CreateIntrinsic( 3499 getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(), 3500 NewLoad, 3501 /*FMFSource=*/nullptr, "strided.vec"); 3502 3503 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { 3504 Instruction *Member = Group->getMember(I); 3505 Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I); 3506 if (!Member) { 3507 // This value is not needed as it's not used 3508 cast<Instruction>(StridedVec)->eraseFromParent(); 3509 continue; 3510 } 3511 // If this member has different type, cast the result type. 3512 if (Member->getType() != ScalarTy) { 3513 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 3514 StridedVec = 3515 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 3516 } 3517 3518 if (Group->isReverse()) 3519 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 3520 3521 State.set(VPDefs[J], StridedVec); 3522 ++J; 3523 } 3524 3525 return; 3526 } 3527 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 3528 3529 // For each member in the group, shuffle out the appropriate data from the 3530 // wide loads. 3531 unsigned J = 0; 3532 for (unsigned I = 0; I < InterleaveFactor; ++I) { 3533 Instruction *Member = Group->getMember(I); 3534 3535 // Skip the gaps in the group. 3536 if (!Member) 3537 continue; 3538 3539 auto StrideMask = 3540 createStrideMask(I, InterleaveFactor, State.VF.getFixedValue()); 3541 Value *StridedVec = 3542 State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec"); 3543 3544 // If this member has different type, cast the result type. 3545 if (Member->getType() != ScalarTy) { 3546 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); 3547 StridedVec = 3548 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); 3549 } 3550 3551 if (Group->isReverse()) 3552 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); 3553 3554 State.set(VPDefs[J], StridedVec); 3555 ++J; 3556 } 3557 return; 3558 } 3559 3560 // The sub vector type for current instruction. 3561 auto *SubVT = VectorType::get(ScalarTy, State.VF); 3562 3563 // Vectorize the interleaved store group. 
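  // Gaps in the group are filled with poison sub-vectors so that every member
  // slot is present before interleaving; those lanes are then masked out of
  // the final store via the gap mask.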
  Value *MaskForGaps =
      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || !State.VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Failed to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(i);

    // Skip the gaps in the group.
    if (!Member) {
      Value *Undef = PoisonValue::get(SubVT);
      StoredVecs.push_back(Undef);
      continue;
    }

    Value *StoredVec = State.get(StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");

    // If this member has a different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

    StoredVecs.push_back(StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        IVec, ResAddr, Group->getAlign(), GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());

  Group->addMetadata(NewStoreInstr);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << " store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << " ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Instruction *InsertPos = getInsertPos();
  // Find the VPValue index of the interleave group. We need to skip gaps.
  unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
    if (auto *Member = IG->getMember(Idx)) {
      if (Member == InsertPos)
        break;
      InsertPosIdx++;
    }
  Type *ValTy = Ctx.Types.inferScalarType(
      getNumDefinedValues() > 0 ?
getVPValue(InsertPosIdx) 3656 : getStoredValues()[InsertPosIdx]); 3657 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 3658 unsigned AS = getLoadStoreAddressSpace(InsertPos); 3659 3660 unsigned InterleaveFactor = IG->getFactor(); 3661 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 3662 3663 // Holds the indices of existing members in the interleaved group. 3664 SmallVector<unsigned, 4> Indices; 3665 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 3666 if (IG->getMember(IF)) 3667 Indices.push_back(IF); 3668 3669 // Calculate the cost of the whole interleaved group. 3670 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( 3671 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices, 3672 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps); 3673 3674 if (!IG->isReverse()) 3675 return Cost; 3676 3677 return Cost + IG->getNumMembers() * 3678 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 3679 VectorTy, VectorTy, {}, Ctx.CostKind, 3680 0); 3681 } 3682 3683 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3684 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3685 VPSlotTracker &SlotTracker) const { 3686 O << Indent << "EMIT "; 3687 printAsOperand(O, SlotTracker); 3688 O << " = CANONICAL-INDUCTION "; 3689 printOperands(O, SlotTracker); 3690 } 3691 #endif 3692 3693 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { 3694 return IsScalarAfterVectorization && 3695 (!IsScalable || vputils::onlyFirstLaneUsed(this)); 3696 } 3697 3698 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 3699 assert(getInductionDescriptor().getKind() == 3700 InductionDescriptor::IK_PtrInduction && 3701 "Not a pointer induction according to InductionDescriptor!"); 3702 assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() && 3703 "Unexpected type."); 3704 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 3705 "Recipe should have been replaced"); 3706 3707 unsigned CurrentPart = getUnrollPart(*this); 3708 3709 // Build a pointer phi 3710 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 3711 Type *ScStValueType = ScalarStartValue->getType(); 3712 3713 BasicBlock *VectorPH = 3714 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); 3715 PHINode *NewPointerPhi = nullptr; 3716 if (CurrentPart == 0) { 3717 IRBuilder<>::InsertPointGuard Guard(State.Builder); 3718 if (State.Builder.GetInsertPoint() != 3719 State.Builder.GetInsertBlock()->getFirstNonPHIIt()) 3720 State.Builder.SetInsertPoint( 3721 State.Builder.GetInsertBlock()->getFirstNonPHIIt()); 3722 NewPointerPhi = State.Builder.CreatePHI(ScStValueType, 2, "pointer.phi"); 3723 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 3724 NewPointerPhi->setDebugLoc(getDebugLoc()); 3725 } else { 3726 // The recipe has been unrolled. In that case, fetch the single pointer phi 3727 // shared among all unrolled parts of the recipe. 3728 auto *GEP = 3729 cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand())); 3730 NewPointerPhi = cast<PHINode>(GEP->getPointerOperand()); 3731 } 3732 3733 // A pointer induction, performed by using a gep 3734 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 3735 Value *ScalarStepValue = State.get(getStepValue(), VPLane(0)); 3736 Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue()); 3737 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 3738 // Add induction update using an incorrect block temporarily. 
The phi node 3739 // will be fixed after VPlan execution. Note that at this point the latch 3740 // block cannot be used, as it does not exist yet. 3741 // TODO: Model increment value in VPlan, by turning the recipe into a 3742 // multi-def and a subclass of VPHeaderPHIRecipe. 3743 if (CurrentPart == 0) { 3744 // The recipe represents the first part of the pointer induction. Create the 3745 // GEP to increment the phi across all unrolled parts. 3746 Value *NumUnrolledElems = State.get(getOperand(2), true); 3747 3748 Value *InductionGEP = GetElementPtrInst::Create( 3749 State.Builder.getInt8Ty(), NewPointerPhi, 3750 State.Builder.CreateMul( 3751 ScalarStepValue, 3752 State.Builder.CreateTrunc(NumUnrolledElems, PhiType)), 3753 "ptr.ind", InductionLoc); 3754 3755 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 3756 } 3757 3758 // Create actual address geps that use the pointer phi as base and a 3759 // vectorized version of the step value (<step*0, ..., step*N>) as offset. 3760 Type *VecPhiType = VectorType::get(PhiType, State.VF); 3761 Value *StartOffsetScalar = State.Builder.CreateMul( 3762 RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); 3763 Value *StartOffset = 3764 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 3765 // Create a vector of consecutive numbers from zero to VF. 3766 StartOffset = State.Builder.CreateAdd( 3767 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 3768 3769 assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) && 3770 "scalar step must be the same across all parts"); 3771 Value *GEP = State.Builder.CreateGEP( 3772 State.Builder.getInt8Ty(), NewPointerPhi, 3773 State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( 3774 State.VF, ScalarStepValue)), 3775 "vector.gep"); 3776 State.set(this, GEP); 3777 } 3778 3779 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3780 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, 3781 VPSlotTracker &SlotTracker) const { 3782 assert((getNumOperands() == 3 || getNumOperands() == 5) && 3783 "unexpected number of operands"); 3784 O << Indent << "EMIT "; 3785 printAsOperand(O, SlotTracker); 3786 O << " = WIDEN-POINTER-INDUCTION "; 3787 getStartValue()->printAsOperand(O, SlotTracker); 3788 O << ", "; 3789 getStepValue()->printAsOperand(O, SlotTracker); 3790 O << ", "; 3791 getOperand(2)->printAsOperand(O, SlotTracker); 3792 if (getNumOperands() == 5) { 3793 O << ", "; 3794 getOperand(3)->printAsOperand(O, SlotTracker); 3795 O << ", "; 3796 getOperand(4)->printAsOperand(O, SlotTracker); 3797 } 3798 } 3799 #endif 3800 3801 void VPExpandSCEVRecipe::execute(VPTransformState &State) { 3802 assert(!State.Lane && "cannot be used in per-lane"); 3803 const DataLayout &DL = SE.getDataLayout(); 3804 SCEVExpander Exp(SE, DL, "induction", /*PreserveLCSSA=*/true); 3805 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), 3806 &*State.Builder.GetInsertPoint()); 3807 State.set(this, Res, VPLane(0)); 3808 } 3809 3810 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3811 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, 3812 VPSlotTracker &SlotTracker) const { 3813 O << Indent << "EMIT "; 3814 printAsOperand(O, SlotTracker); 3815 O << " = EXPAND SCEV " << *Expr; 3816 } 3817 #endif 3818 3819 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 3820 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true); 3821 Type *STy = CanonicalIV->getType(); 3822 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 3823 ElementCount VF = 
State.VF; 3824 Value *VStart = VF.isScalar() 3825 ? CanonicalIV 3826 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); 3827 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); 3828 if (VF.isVector()) { 3829 VStep = Builder.CreateVectorSplat(VF, VStep); 3830 VStep = 3831 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); 3832 } 3833 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 3834 State.set(this, CanonicalVectorIV); 3835 } 3836 3837 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3838 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 3839 VPSlotTracker &SlotTracker) const { 3840 O << Indent << "EMIT "; 3841 printAsOperand(O, SlotTracker); 3842 O << " = WIDEN-CANONICAL-INDUCTION "; 3843 printOperands(O, SlotTracker); 3844 } 3845 #endif 3846 3847 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { 3848 auto &Builder = State.Builder; 3849 // Create a vector from the initial value. 3850 auto *VectorInit = getStartValue()->getLiveInIRValue(); 3851 3852 Type *VecTy = State.VF.isScalar() 3853 ? VectorInit->getType() 3854 : VectorType::get(VectorInit->getType(), State.VF); 3855 3856 BasicBlock *VectorPH = 3857 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); 3858 if (State.VF.isVector()) { 3859 auto *IdxTy = Builder.getInt32Ty(); 3860 auto *One = ConstantInt::get(IdxTy, 1); 3861 IRBuilder<>::InsertPointGuard Guard(Builder); 3862 Builder.SetInsertPoint(VectorPH->getTerminator()); 3863 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); 3864 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3865 VectorInit = Builder.CreateInsertElement( 3866 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); 3867 } 3868 3869 // Create a phi node for the new recurrence. 3870 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur"); 3871 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); 3872 Phi->addIncoming(VectorInit, VectorPH); 3873 State.set(this, Phi); 3874 } 3875 3876 InstructionCost 3877 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, 3878 VPCostContext &Ctx) const { 3879 if (VF.isScalar()) 3880 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); 3881 3882 return 0; 3883 } 3884 3885 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3886 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, 3887 VPSlotTracker &SlotTracker) const { 3888 O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; 3889 printAsOperand(O, SlotTracker); 3890 O << " = phi "; 3891 printOperands(O, SlotTracker); 3892 } 3893 #endif 3894 3895 void VPReductionPHIRecipe::execute(VPTransformState &State) { 3896 // Reductions do not have to start at zero. They can start with 3897 // any loop invariant values. 3898 VPValue *StartVPV = getStartValue(); 3899 3900 // In order to support recurrences we need to be able to vectorize Phi nodes. 3901 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3902 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 3903 // this value when we vectorize all of the instructions that use the PHI. 
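  // The incoming value for the loop backedge is filled in later, once the
  // vector loop body has been generated; only the start-value edge from the
  // preheader is added here.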
3904 BasicBlock *VectorPH = 3905 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); 3906 bool ScalarPHI = State.VF.isScalar() || IsInLoop; 3907 Value *StartV = State.get(StartVPV, ScalarPHI); 3908 Type *VecTy = StartV->getType(); 3909 3910 BasicBlock *HeaderBB = State.CFG.PrevBB; 3911 assert(State.CurrentParentLoop->getHeader() == HeaderBB && 3912 "recipe must be in the vector loop header"); 3913 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); 3914 Phi->insertBefore(HeaderBB->getFirstInsertionPt()); 3915 State.set(this, Phi, IsInLoop); 3916 3917 Phi->addIncoming(StartV, VectorPH); 3918 } 3919 3920 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3921 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3922 VPSlotTracker &SlotTracker) const { 3923 O << Indent << "WIDEN-REDUCTION-PHI "; 3924 3925 printAsOperand(O, SlotTracker); 3926 O << " = phi "; 3927 printOperands(O, SlotTracker); 3928 if (VFScaleFactor != 1) 3929 O << " (VF scaled by 1/" << VFScaleFactor << ")"; 3930 } 3931 #endif 3932 3933 void VPWidenPHIRecipe::execute(VPTransformState &State) { 3934 Value *Op0 = State.get(getOperand(0)); 3935 Type *VecTy = Op0->getType(); 3936 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name); 3937 // Manually move it with the other PHIs in case PHI recipes above this one 3938 // also inserted non-phi instructions. 3939 // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in 3940 // convertToConcreteRecipes. 3941 VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt()); 3942 State.set(this, VecPhi); 3943 } 3944 3945 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3946 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3947 VPSlotTracker &SlotTracker) const { 3948 O << Indent << "WIDEN-PHI "; 3949 3950 printAsOperand(O, SlotTracker); 3951 O << " = phi "; 3952 printPhiOperands(O, SlotTracker); 3953 } 3954 #endif 3955 3956 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and 3957 // remove VPActiveLaneMaskPHIRecipe. 3958 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { 3959 BasicBlock *VectorPH = 3960 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); 3961 Value *StartMask = State.get(getOperand(0)); 3962 PHINode *Phi = 3963 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); 3964 Phi->addIncoming(StartMask, VectorPH); 3965 State.set(this, Phi); 3966 } 3967 3968 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3969 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3970 VPSlotTracker &SlotTracker) const { 3971 O << Indent << "ACTIVE-LANE-MASK-PHI "; 3972 3973 printAsOperand(O, SlotTracker); 3974 O << " = phi "; 3975 printOperands(O, SlotTracker); 3976 } 3977 #endif 3978 3979 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 3980 void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, 3981 VPSlotTracker &SlotTracker) const { 3982 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; 3983 3984 printAsOperand(O, SlotTracker); 3985 O << " = phi "; 3986 printOperands(O, SlotTracker); 3987 } 3988 #endif 3989