//===- VPlanAnalysis.cpp - Various Analyses working on VPlan ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "VPlanAnalysis.h"
#include "VPlan.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

#define DEBUG_TYPE "vplan"

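/// Determine the scalar type of the canonical IV: prefer the type of an
/// existing VPCanonicalIVPHIRecipe in the vector loop region and otherwise
/// fall back to the type of the trip-count expression.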
VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan)
    : Ctx(Plan.getScalarHeader()->getIRBasicBlock()->getContext()) {
  if (auto LoopRegion = Plan.getVectorLoopRegion()) {
    if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(
            &LoopRegion->getEntryBasicBlock()->front())) {
      CanonicalIVTy = CanIV->getScalarType();
      return;
    }
  }

  // If there's no canonical IV, retrieve the type from the trip count
  // expression.
  auto *TC = Plan.getTripCount();
  if (TC->isLiveIn()) {
    CanonicalIVTy = TC->getLiveInIRValue()->getType();
    return;
  }
  CanonicalIVTy = cast<VPExpandSCEVRecipe>(TC)->getSCEV()->getType();
}

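/// All incoming values of a blend must have the same scalar type; infer the
/// result type from the first incoming value and cache it for the others.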
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPBlendRecipe *R) {
  Type *ResTy = inferScalarType(R->getIncomingValue(0));
  for (unsigned I = 1, E = R->getNumIncomingValues(); I != E; ++I) {
    VPValue *Inc = R->getIncomingValue(I);
    assert(inferScalarType(Inc) == ResTy &&
           "different types inferred for different incoming values");
    CachedTypes[Inc] = ResTy;
  }
  return ResTy;
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
  // Set the result type from the first operand, check if the types for all
  // other operands match and cache them.
  auto SetResultTyFromOp = [this, R]() {
    Type *ResTy = inferScalarType(R->getOperand(0));
    for (unsigned Op = 1; Op != R->getNumOperands(); ++Op) {
      VPValue *OtherV = R->getOperand(Op);
      assert(inferScalarType(OtherV) == ResTy &&
             "different types inferred for different operands");
      CachedTypes[OtherV] = ResTy;
    }
    return ResTy;
  };

  unsigned Opcode = R->getOpcode();
  if (Instruction::isBinaryOp(Opcode) || Instruction::isUnaryOp(Opcode))
    return SetResultTyFromOp();

  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case VPInstruction::ReductionStartVector:
    return inferScalarType(R->getOperand(0));
  case Instruction::Select: {
    Type *ResTy = inferScalarType(R->getOperand(1));
    VPValue *OtherV = R->getOperand(2);
    assert(inferScalarType(OtherV) == ResTy &&
           "different types inferred for different operands");
    CachedTypes[OtherV] = ResTy;
    return ResTy;
  }
  case Instruction::ICmp:
  case Instruction::FCmp:
  case VPInstruction::ActiveLaneMask:
    assert(inferScalarType(R->getOperand(0)) ==
               inferScalarType(R->getOperand(1)) &&
           "different types inferred for different operands");
    return IntegerType::get(Ctx, 1);
  case VPInstruction::ComputeAnyOfResult:
    return inferScalarType(R->getOperand(1));
  case VPInstruction::ComputeFindIVResult:
  case VPInstruction::ComputeReductionResult: {
    return inferScalarType(R->getOperand(0));
  }
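  // The explicit vector length is a 32-bit integer, matching the result of
  // the llvm.experimental.get.vector.length intrinsic.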
  case VPInstruction::ExplicitVectorLength:
    return Type::getIntNTy(Ctx, 32);
  case Instruction::PHI:
    // Infer the type of the first operand only, as other operands of header
    // phis may lead to infinite recursion.
    return inferScalarType(R->getOperand(0));
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::Not:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
    return SetResultTyFromOp();
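  // Lane indices, as produced by FirstActiveLane, are 64-bit integers.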
  case VPInstruction::FirstActiveLane:
    return Type::getIntNTy(Ctx, 64);
  case VPInstruction::ExtractLastElement:
  case VPInstruction::ExtractPenultimateElement: {
    Type *BaseTy = inferScalarType(R->getOperand(0));
    if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
      return VecTy->getElementType();
    return BaseTy;
  }
  case VPInstruction::LogicalAnd:
    assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) &&
           inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
           "LogicalAnd operands should be bool");
    return IntegerType::get(Ctx, 1);
  case VPInstruction::Broadcast:
  case VPInstruction::PtrAdd:
    // Return the type based on the first operand.
    return inferScalarType(R->getOperand(0));
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
    return Type::getVoidTy(Ctx);
  default:
    break;
  }
  // Type inference not implemented for opcode.
  LLVM_DEBUG({
    dbgs() << "LV: Found unhandled opcode for: ";
    R->getVPSingleValue()->dump();
  });
  llvm_unreachable("Unhandled opcode!");
}

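/// Infer the result type of a widened instruction from its operands; compares
/// produce an i1 and extractvalue takes its type from the struct operand.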
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) {
  unsigned Opcode = R->getOpcode();
  if (Instruction::isBinaryOp(Opcode) || Instruction::isShift(Opcode) ||
      Instruction::isBitwiseLogicOp(Opcode)) {
    Type *ResTy = inferScalarType(R->getOperand(0));
    assert(ResTy == inferScalarType(R->getOperand(1)) &&
           "types for both operands must match for binary op");
    CachedTypes[R->getOperand(1)] = ResTy;
    return ResTy;
  }

  switch (Opcode) {
  case Instruction::ICmp:
  case Instruction::FCmp:
    return IntegerType::get(Ctx, 1);
  case Instruction::FNeg:
  case Instruction::Freeze:
    return inferScalarType(R->getOperand(0));
  case Instruction::ExtractValue: {
    assert(R->getNumOperands() == 2 && "expected single level extractvalue");
    auto *StructTy = cast<StructType>(inferScalarType(R->getOperand(0)));
    auto *CI = cast<ConstantInt>(R->getOperand(1)->getLiveInIRValue());
    return StructTy->getTypeAtIndex(CI->getZExtValue());
  }
  default:
    break;
  }

  // Type inference not implemented for opcode.
  LLVM_DEBUG({
    dbgs() << "LV: Found unhandled opcode for: ";
    R->getVPSingleValue()->dump();
  });
  llvm_unreachable("Unhandled opcode!");
}

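/// The scalar result type of a widened call matches the return type of the
/// underlying call instruction.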
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
  auto &CI = *cast<CallInst>(R->getUnderlyingInstr());
  return CI.getType();
}

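/// Among memory recipes only loads define a value; the result type is the
/// scalar type of the underlying load.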
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
  assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
         "Store recipes should not define any values");
  return cast<LoadInst>(&R->getIngredient())->getType();
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenSelectRecipe *R) {
  Type *ResTy = inferScalarType(R->getOperand(1));
  VPValue *OtherV = R->getOperand(2);
  assert(inferScalarType(OtherV) == ResTy &&
         "different types inferred for different operands");
  CachedTypes[OtherV] = ResTy;
  return ResTy;
}

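/// Replicate recipes execute the underlying scalar instruction once per lane,
/// so the scalar result type is determined by the underlying instruction's
/// opcode.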
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
  unsigned Opcode = R->getUnderlyingInstr()->getOpcode();

  if (Instruction::isBinaryOp(Opcode) || Instruction::isShift(Opcode) ||
      Instruction::isBitwiseLogicOp(Opcode)) {
    Type *ResTy = inferScalarType(R->getOperand(0));
    assert(ResTy == inferScalarType(R->getOperand(1)) &&
           "inferred types for operands of binary op don't match");
    CachedTypes[R->getOperand(1)] = ResTy;
    return ResTy;
  }

  if (Instruction::isCast(Opcode))
    return R->getUnderlyingInstr()->getType();

  switch (Opcode) {
  case Instruction::Call: {
    unsigned CallIdx = R->getNumOperands() - (R->isPredicated() ? 2 : 1);
    return cast<Function>(R->getOperand(CallIdx)->getLiveInIRValue())
        ->getReturnType();
  }
  case Instruction::Select: {
    Type *ResTy = inferScalarType(R->getOperand(1));
    assert(ResTy == inferScalarType(R->getOperand(2)) &&
           "inferred types for operands of select op don't match");
    CachedTypes[R->getOperand(2)] = ResTy;
    return ResTy;
  }
  case Instruction::ICmp:
  case Instruction::FCmp:
    return IntegerType::get(Ctx, 1);
  case Instruction::Alloca:
  case Instruction::ExtractValue:
    return R->getUnderlyingInstr()->getType();
  case Instruction::Freeze:
  case Instruction::FNeg:
  case Instruction::GetElementPtr:
    return inferScalarType(R->getOperand(0));
  case Instruction::Load:
    return cast<LoadInst>(R->getUnderlyingInstr())->getType();
  case Instruction::Store:
    // FIXME: VPReplicateRecipes with store opcodes still define a result
    // VPValue, so we need to handle them here. Remove the code here once this
    // is modeled accurately in VPlan.
    return Type::getVoidTy(Ctx);
  default:
    break;
  }
  // Type inference not implemented for opcode.
  LLVM_DEBUG({
    dbgs() << "LV: Found unhandled opcode for: ";
    R->getVPSingleValue()->dump();
  });
  llvm_unreachable("Unhandled opcode");
}

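/// Infer the scalar type of \p V and cache the result. Live-ins use their IR
/// type (or the canonical IV type if they have no underlying IR value); all
/// other values dispatch on their defining recipe.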
Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
  if (Type *CachedTy = CachedTypes.lookup(V))
    return CachedTy;

  if (V->isLiveIn()) {
    if (auto *IRValue = V->getLiveInIRValue())
      return IRValue->getType();
    // All VPValues without any underlying IR value (like the vector trip count
    // or the backedge-taken count) have the same type as the canonical IV.
    return CanonicalIVTy;
  }

  Type *ResultTy =
      TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
          .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
                VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
                VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
              [this](const auto *R) {
                // Handle header phi recipes, except VPWidenIntOrFpInduction
                // which needs special handling as it may be truncated.
                // TODO: consider inferring/caching type of siblings, e.g.,
                // backedge value, here and in cases below.
                return inferScalarType(R->getStartValue());
              })
          .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
              [](const auto *R) { return R->getScalarType(); })
          .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
                VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
                VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
                VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
            return inferScalarType(R->getOperand(0));
          })
          // VPInstructionWithType must be handled before VPInstruction.
          .Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
                VPWidenCastRecipe>(
              [](const auto *R) { return R->getResultType(); })
          .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
                VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
              [this](const auto *R) { return inferScalarTypeForRecipe(R); })
          .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
            // TODO: Use info from interleave group.
            return V->getUnderlyingValue()->getType();
          })
          .Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
            return R->getSCEV()->getType();
          })
          .Case<VPReductionRecipe>([this](const auto *R) {
            return inferScalarType(R->getChainOp());
          })
          .Case<VPExpressionRecipe>([this](const auto *R) {
            return inferScalarType(R->getOperandOfResultType());
          });

  assert(ResultTy && "could not infer type for the given VPValue");
  CachedTypes[V] = ResultTy;
  return ResultTy;
}

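/// Collect recipes that only (transitively) feed llvm.assume calls and are
/// therefore ephemeral: they do not contribute to the vectorized code itself.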
void llvm::collectEphemeralRecipesForVPlan(
    VPlan &Plan, DenseSet<VPRecipeBase *> &EphRecipes) {
  // First, collect seed recipes which are operands of assumes.
  SmallVector<VPRecipeBase *> Worklist;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || !match(RepR->getUnderlyingInstr(),
                          PatternMatch::m_Intrinsic<Intrinsic::assume>()))
        continue;
      Worklist.push_back(RepR);
      EphRecipes.insert(RepR);
    }
  }

  // Process operands of candidates in worklist and add them to the set of
  // ephemeral recipes, if they don't have side-effects and are only used by
  // other ephemeral recipes.
  while (!Worklist.empty()) {
    VPRecipeBase *Cur = Worklist.pop_back_val();
    for (VPValue *Op : Cur->operands()) {
      auto *OpR = Op->getDefiningRecipe();
      if (!OpR || OpR->mayHaveSideEffects() || EphRecipes.contains(OpR))
        continue;
      if (any_of(Op->users(), [&EphRecipes](VPUser *U) {
            auto *UR = dyn_cast<VPRecipeBase>(U);
            return !UR || !EphRecipes.contains(UR);
          }))
        continue;
      EphRecipes.insert(OpR);
      Worklist.push_back(OpR);
    }
  }
}

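// Explicit instantiation of the dominator tree computation for VPlan CFGs,
// used by VPDominatorTree below.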
template void DomTreeBuilder::Calculate<DominatorTreeBase<VPBlockBase, false>>(
    DominatorTreeBase<VPBlockBase, false> &DT);

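/// Returns true if recipe \p A properly dominates \p B: within a single block
/// this is determined by recipe order; across blocks it reduces to block
/// dominance.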
bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
                                        const VPRecipeBase *B) {
  if (A == B)
    return false;

  auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) {
    for (auto &R : *A->getParent()) {
      if (&R == A)
        return true;
      if (&R == B)
        return false;
    }
    llvm_unreachable("recipe not found");
  };
  const VPBlockBase *ParentA = A->getParent();
  const VPBlockBase *ParentB = B->getParent();
  if (ParentA == ParentB)
    return LocalComesBefore(A, B);

#ifndef NDEBUG
  auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
    auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
    if (Region && Region->isReplicator()) {
      assert(Region->getNumSuccessors() == 1 &&
             Region->getNumPredecessors() == 1 && "Expected SESE region!");
      assert(R->getParent()->size() == 1 &&
             "A recipe in an original replicator region must be the only "
             "recipe in its block");
      return Region;
    }
    return nullptr;
  };
  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) &&
         "No replicate regions expected at this point");
  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) &&
         "No replicate regions expected at this point");
#endif
  return Base::properlyDominates(ParentA, ParentB);
}

/// Get the VF scaling factor applied to the recipe's output, if the recipe has
/// one.
static unsigned getVFScaleFactor(VPRecipeBase *R) {
  if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
    return RR->getVFScaleFactor();
  if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
    return RR->getVFScaleFactor();
  assert(
      (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
                                     VPInstruction::ReductionStartVector) &&
      "getting scaling factor of reduction-start-vector not implemented yet");
  return 1;
}

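/// Returns true if the maximum usage of any register class exceeds the number
/// of registers the target provides for that class.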
bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI) const {
  return any_of(MaxLocalUsers, [&TTI](auto &LU) {
    return LU.second > TTI.getNumberOfRegisters(LU.first);
  });
}

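/// Estimate register usage for each VF in \p VFs using a linear-scan style
/// interval analysis: recipes are numbered in reverse post-order, intervals
/// run from a recipe's definition to its last in-loop use, and the maximum
/// number of simultaneously open intervals per register class approximates
/// the register pressure.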
SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
    VPlan &Plan, ArrayRef<ElementCount> VFs, const TargetTransformInfo &TTI,
    const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // recipe that is the key.
  using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;

  // Maps indices to recipes.
  SmallVector<VPRecipeBase *, 64> Idx2Recipe;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of recipe indices that are used in the loop.
  SmallPtrSet<VPRecipeBase *, 8> Ends;
  // Saves the list of values that are used in the loop but are defined outside
  // the loop (not including non-recipe values such as arguments and
  // constants).
  SmallSetVector<VPValue *, 8> LoopInvariants;
  LoopInvariants.insert(&Plan.getVectorTripCount());

  // We scan the loop in topological order and assign a number to each recipe.
  // We use RPO to ensure that defs are met before their users. We assume that
  // each recipe that has in-loop users starts an interval. We record every
  // time that an in-loop value is used, so we have a list of the first and
  // last occurrences of each recipe.
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    if (!VPBB->getParent())
      break;
    for (VPRecipeBase &R : *VPBB) {
      Idx2Recipe.push_back(&R);

      // Save the end location of each USE.
      for (VPValue *U : R.operands()) {
        auto *DefR = U->getDefiningRecipe();

        // Ignore non-recipe values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If
        // for example an argument is used inside the loop it will increase the
        // register pressure (so shouldn't we add it to LoopInvariants?).
        if (!DefR && (!U->getLiveInIRValue() ||
                      !isa<Instruction>(U->getLiveInIRValue())))
          continue;

        // If this recipe is outside the loop then record it and continue.
        if (!DefR) {
          LoopInvariants.insert(U);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[DefR] = Idx2Recipe.size();
        Ends.insert(DefR);
      }
    }
    if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
      // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
      // exiting block, where their increment will get materialized eventually.
      for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
        if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
          EndPoint[&R] = Idx2Recipe.size();
          Ends.insert(&R);
        }
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using RecipeList = SmallVector<VPRecipeBase *, 2>;
  SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;

  // Next, we transpose the EndPoints into a multi map that holds the list of
  // intervals that *end* at a specific location.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
  SmallVector<VPRegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());

  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
        (VF.isScalable() &&
         !TTICapture.isElementTypeLegalForScalableVector(Ty)))
      return 0;
    return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
  };

  // We scan the instructions linearly and record each time that a new interval
  // starts, by placing it in a set. If we find this value in TransposeEnds
  // then we remove it from the set. The max register usage is the maximum
  // register usage of the recipes of the set.
  for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
    VPRecipeBase *R = Idx2Recipe[Idx];

    // Remove all of the recipes that end at this location.
    RecipeList &List = TransposeEnds[Idx];
    for (VPRecipeBase *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore recipes that are never used within the loop and do not have side
    // effects.
    if (!Ends.count(R) && !R->mayHaveSideEffects())
      continue;

    // Skip recipes for ignored values.
    // TODO: Should mark recipes for ephemeral values that cannot be removed
    // explicitly in VPlan.
    if (isa<VPSingleDefRecipe>(R) &&
        ValuesToIgnore.contains(
            cast<VPSingleDefRecipe>(R)->getUnderlyingValue()))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
      // Count the number of registers used, per register class, given all open
      // intervals.
      // Note that elements in this SmallMapVector will be default constructed
      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
      // there is no previous entry for ClassID.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      for (auto *R : OpenIntervals) {
        // Skip recipes that weren't present in the original loop.
        // TODO: Remove after removing the legacy
        // LoopVectorizationCostModel::calculateRegisterUsage
        if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
                VPBranchOnMaskRecipe>(R))
          continue;

        if (VFs[J].isScalar() ||
            isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
                VPScalarIVStepsRecipe>(R) ||
            (isa<VPInstruction>(R) &&
             all_of(cast<VPSingleDefRecipe>(R)->users(),
                    [&](VPUser *U) {
                      return cast<VPRecipeBase>(U)->usesScalars(
                          R->getVPSingleValue());
                    })) ||
            (isa<VPReductionPHIRecipe>(R) &&
             (cast<VPReductionPHIRecipe>(R))->isInLoop())) {
          unsigned ClassID = TTI.getRegisterClassForType(
              false, TypeInfo.inferScalarType(R->getVPSingleValue()));
          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
          RegUsage[ClassID] += 1;
        } else {
          // The output from scaled phis and scaled reductions actually has
          // fewer lanes than the VF.
          unsigned ScaleFactor = getVFScaleFactor(R);
          ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
          LLVM_DEBUG(if (VF != VFs[J]) {
            dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
                   << " for " << *R << "\n";
          });

          for (VPValue *DefV : R->definedValues()) {
            Type *ScalarTy = TypeInfo.inferScalarType(DefV);
            unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
            RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
          }
        }
      }

      for (const auto &Pair : RegUsage) {
        auto &Entry = MaxUsages[J][Pair.first];
        Entry = std::max(Entry, Pair.second);
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current recipe to the list of open intervals.
    OpenIntervals.insert(R);
  }

  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  VPRegisterUsage RU;
  for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
    // Note that elements in this SmallMapVector will be default constructed
    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
    // there is no previous entry for ClassID.
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto *In : LoopInvariants) {
      // FIXME: The target might use more than one register for the type
      // even in the scalar case.
      bool IsScalar = all_of(In->users(), [&](VPUser *U) {
        return cast<VPRecipeBase>(U)->usesScalars(In);
      });

      ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
      unsigned ClassID = TTI.getRegisterClassForType(
          VF.isVector(), TypeInfo.inferScalarType(In));
      Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
             << " item\n";
      for (const auto &pair : MaxUsages[Idx]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[Idx];
    RUs[Idx] = RU;
  }

  return RUs;
}