//=== AArch64PostLegalizerCombiner.cpp --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Post-legalization combines on generic MachineInstrs.
///
/// The combines here must preserve instruction legality.
///
/// Lowering combines (e.g. pseudo matching) should be handled by
/// AArch64PostLegalizerLowering.
///
/// Combines which don't rely on instruction legality should go in the
/// AArch64PreLegalizerCombiner.
///
//===----------------------------------------------------------------------===//

#include "AArch64TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// This combine tries to do what performExtractVectorEltCombine does in SDAG.
/// Rewrite for pairwise fadd pattern
///   (s32 (g_extract_vector_elt
///           (g_fadd (vXs32 Other)
///                   (g_vector_shuffle (vXs32 Other) undef <1,X,...>)) 0))
/// ->
///   (s32 (g_fadd (g_extract_vector_elt (vXs32 Other) 0)
///                (g_extract_vector_elt (vXs32 Other) 1)))
bool matchExtractVecEltPairwiseAdd(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    std::tuple<unsigned, LLT, Register> &MatchInfo) {
  Register Src1 = MI.getOperand(1).getReg();
  Register Src2 = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  auto Cst = getIConstantVRegValWithLookThrough(Src2, MRI);
  if (!Cst || Cst->Value != 0)
    return false;
  // SDAG also checks for FullFP16, but this looks to be beneficial anyway.

  // Now check for an fadd operation. TODO: expand this for integer add?
  auto *FAddMI = getOpcodeDef(TargetOpcode::G_FADD, Src1, MRI);
  if (!FAddMI)
    return false;

  // If we add support for integer add, must restrict these types to just s64.
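  // Only scalar FP result sizes with a matching pairwise-add form
  // (16, 32 or 64 bits) are handled below.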
  unsigned DstSize = DstTy.getSizeInBits();
  if (DstSize != 16 && DstSize != 32 && DstSize != 64)
    return false;

  Register Src1Op1 = FAddMI->getOperand(1).getReg();
  Register Src1Op2 = FAddMI->getOperand(2).getReg();
  MachineInstr *Shuffle =
      getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op2, MRI);
  MachineInstr *Other = MRI.getVRegDef(Src1Op1);
  if (!Shuffle) {
    Shuffle = getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op1, MRI);
    Other = MRI.getVRegDef(Src1Op2);
  }

  // We're looking for a shuffle that moves the second element to index 0.
  if (Shuffle && Shuffle->getOperand(3).getShuffleMask()[0] == 1 &&
      Other == MRI.getVRegDef(Shuffle->getOperand(1).getReg())) {
    std::get<0>(MatchInfo) = TargetOpcode::G_FADD;
    std::get<1>(MatchInfo) = DstTy;
    std::get<2>(MatchInfo) = Other->getOperand(0).getReg();
    return true;
  }
  return false;
}

void applyExtractVecEltPairwiseAdd(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    std::tuple<unsigned, LLT, Register> &MatchInfo) {
  unsigned Opc = std::get<0>(MatchInfo);
  assert(Opc == TargetOpcode::G_FADD && "Unexpected opcode!");
  // We want to generate two extracts of elements 0 and 1, and add them.
  LLT Ty = std::get<1>(MatchInfo);
  Register Src = std::get<2>(MatchInfo);
  LLT s64 = LLT::scalar(64);
  B.setInstrAndDebugLoc(MI);
  auto Elt0 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 0));
  auto Elt1 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 1));
  B.buildInstr(Opc, {MI.getOperand(0).getReg()}, {Elt0, Elt1});
  MI.eraseFromParent();
}

bool isSignExtended(Register R, MachineRegisterInfo &MRI) {
  // TODO: check if extended build vector as well.
  unsigned Opc = MRI.getVRegDef(R)->getOpcode();
  return Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG;
}

bool isZeroExtended(Register R, MachineRegisterInfo &MRI) {
  // TODO: check if extended build vector as well.
  return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT;
}

bool matchAArch64MulConstCombine(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
  assert(MI.getOpcode() == TargetOpcode::G_MUL);
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  Register Dst = MI.getOperand(0).getReg();
  const LLT Ty = MRI.getType(LHS);

  // The below optimizations require a constant RHS.
  auto Const = getIConstantVRegValWithLookThrough(RHS, MRI);
  if (!Const)
    return false;

  APInt ConstValue = Const->Value.sext(Ty.getSizeInBits());
  // The following code is ported from AArch64ISelLowering.
  // Multiplication of a power of two plus/minus one can be done more
  // cheaply as shift+add/sub. For now, this is true unilaterally. If
  // future CPUs have a cheaper MADD instruction, this may need to be
  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
  // 64-bit is 5 cycles, so this is always a win.
  // More aggressively, some multiplications N0 * C can be lowered to
  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
  // e.g. 6=3*2=(2+1)*2.
  // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
  // which equals (1+2)*16-(1+2).
  // TrailingZeroes is used to test if the mul can be lowered to
  // shift+add+shift.
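  // Worked example for C = 6 = (2^1 + 1) * 2^1: TrailingZeroes = 1 and
  // ShiftedConstValue = 3, so x * 6 becomes (shl (add (shl x, 1), x), 1).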
  unsigned TrailingZeroes = ConstValue.countr_zero();
  if (TrailingZeroes) {
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into smul or umul.
    if (MRI.hasOneNonDBGUse(LHS) &&
        (isSignExtended(LHS, MRI) || isZeroExtended(LHS, MRI)))
      return false;
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into madd or msub.
    if (MRI.hasOneNonDBGUse(Dst)) {
      MachineInstr &UseMI = *MRI.use_instr_begin(Dst);
      unsigned UseOpc = UseMI.getOpcode();
      if (UseOpc == TargetOpcode::G_ADD || UseOpc == TargetOpcode::G_PTR_ADD ||
          UseOpc == TargetOpcode::G_SUB)
        return false;
    }
  }
  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
  // and shift+add+shift.
  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

  unsigned ShiftAmt, AddSubOpc;
  // Is the shifted value the LHS operand of the add/sub?
  bool ShiftValUseIsLHS = true;
  // Do we need to negate the result?
  bool NegateResult = false;

  if (ConstValue.isNonNegative()) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
    APInt SCVMinus1 = ShiftedConstValue - 1;
    APInt CVPlus1 = ConstValue + 1;
    if (SCVMinus1.isPowerOf2()) {
      ShiftAmt = SCVMinus1.logBase2();
      AddSubOpc = TargetOpcode::G_ADD;
    } else if (CVPlus1.isPowerOf2()) {
      ShiftAmt = CVPlus1.logBase2();
      AddSubOpc = TargetOpcode::G_SUB;
    } else
      return false;
  } else {
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
    APInt CVNegPlus1 = -ConstValue + 1;
    APInt CVNegMinus1 = -ConstValue - 1;
    if (CVNegPlus1.isPowerOf2()) {
      ShiftAmt = CVNegPlus1.logBase2();
      AddSubOpc = TargetOpcode::G_SUB;
      ShiftValUseIsLHS = false;
    } else if (CVNegMinus1.isPowerOf2()) {
      ShiftAmt = CVNegMinus1.logBase2();
      AddSubOpc = TargetOpcode::G_ADD;
      NegateResult = true;
    } else
      return false;
  }

  if (NegateResult && TrailingZeroes)
    return false;

  ApplyFn = [=](MachineIRBuilder &B, Register DstReg) {
    auto Shift = B.buildConstant(LLT::scalar(64), ShiftAmt);
    auto ShiftedVal = B.buildShl(Ty, LHS, Shift);

    Register AddSubLHS = ShiftValUseIsLHS ? ShiftedVal.getReg(0) : LHS;
    Register AddSubRHS = ShiftValUseIsLHS ? LHS : ShiftedVal.getReg(0);
    auto Res = B.buildInstr(AddSubOpc, {Ty}, {AddSubLHS, AddSubRHS});
    assert(!(NegateResult && TrailingZeroes) &&
           "NegateResult and TrailingZeroes cannot both be true for now.");
    // Negate the result.
    if (NegateResult) {
      B.buildSub(DstReg, B.buildConstant(Ty, 0), Res);
      return;
    }
    // Shift the result.
    if (TrailingZeroes) {
      B.buildShl(DstReg, Res, B.buildConstant(LLT::scalar(64), TrailingZeroes));
      return;
    }
    B.buildCopy(DstReg, Res.getReg(0));
  };
  return true;
}

void applyAArch64MulConstCombine(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
  B.setInstrAndDebugLoc(MI);
  ApplyFn(B, MI.getOperand(0).getReg());
  MI.eraseFromParent();
}

/// Try to fold a G_MERGE_VALUES of 2 s32 sources, where the second source
/// is a zero, into a G_ZEXT of the first.
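///
/// e.g. %d(s64) = G_MERGE_VALUES %a(s32), 0(s32)
///       -->
///      %d(s64) = G_ZEXT %a(s32)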
bool matchFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI) {
  auto &Merge = cast<GMerge>(MI);
  LLT SrcTy = MRI.getType(Merge.getSourceReg(0));
  if (SrcTy != LLT::scalar(32) || Merge.getNumSources() != 2)
    return false;
  return mi_match(Merge.getSourceReg(1), MRI, m_SpecificICst(0));
}

void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B, GISelChangeObserver &Observer) {
  // Mutate %d(s64) = G_MERGE_VALUES %a(s32), 0(s32)
  //  ->
  //        %d(s64) = G_ZEXT %a(s32)
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
  MI.removeOperand(2);
  Observer.changedInstr(MI);
}

/// \returns True if a G_ANYEXT instruction \p MI should be mutated to a G_ZEXT
/// instruction.
bool matchMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI) {
  // If this is coming from a scalar compare then we can use a G_ZEXT instead of
  // a G_ANYEXT:
  //
  // %cmp:_(s32) = G_[I|F]CMP ... <-- produces 0/1.
  // %ext:_(s64) = G_ANYEXT %cmp(s32)
  //
  // By doing this, we can leverage more KnownBits combines.
  assert(MI.getOpcode() == TargetOpcode::G_ANYEXT);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  return MRI.getType(Dst).isScalar() &&
         mi_match(Src, MRI,
                  m_any_of(m_GICmp(m_Pred(), m_Reg(), m_Reg()),
                           m_GFCmp(m_Pred(), m_Reg(), m_Reg())));
}

void applyMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &B,
                             GISelChangeObserver &Observer) {
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
  Observer.changedInstr(MI);
}

/// Match a 128b store of zero and split it into two 64 bit stores, for
/// size/performance reasons.
bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
  GStore &Store = cast<GStore>(MI);
  if (!Store.isSimple())
    return false;
  LLT ValTy = MRI.getType(Store.getValueReg());
  if (ValTy.isScalableVector())
    return false;
  if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
    return false;
  if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
    return false; // Don't split truncating stores.
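  // Only split when this store is the sole non-debug user of the stored
  // value.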
  if (!MRI.hasOneNonDBGUse(Store.getValueReg()))
    return false;
  auto MaybeCst = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(Store.getValueReg()), MRI);
  return MaybeCst && MaybeCst->isZero();
}

void applySplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &B,
                            GISelChangeObserver &Observer) {
  B.setInstrAndDebugLoc(MI);
  GStore &Store = cast<GStore>(MI);
  assert(MRI.getType(Store.getValueReg()).isVector() &&
         "Expected a vector store value");
  LLT NewTy = LLT::scalar(64);
  Register PtrReg = Store.getPointerReg();
  auto Zero = B.buildConstant(NewTy, 0);
  auto HighPtr = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg,
                               B.buildConstant(LLT::scalar(64), 8));
  auto &MF = *MI.getMF();
  auto *LowMMO = MF.getMachineMemOperand(&Store.getMMO(), 0, NewTy);
  auto *HighMMO = MF.getMachineMemOperand(&Store.getMMO(), 8, NewTy);
  B.buildStore(Zero, PtrReg, *LowMMO);
  B.buildStore(Zero, HighPtr, *HighMMO);
  Store.eraseFromParent();
}

bool matchOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
                  std::tuple<Register, Register, Register> &MatchInfo) {
  const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if (!DstTy.isVector())
    return false;

  Register AO1, AO2, BVO1, BVO2;
  if (!mi_match(MI, MRI,
                m_GOr(m_GAnd(m_Reg(AO1), m_Reg(BVO1)),
                      m_GAnd(m_Reg(AO2), m_Reg(BVO2)))))
    return false;

  auto *BV1 = getOpcodeDef<GBuildVector>(BVO1, MRI);
  auto *BV2 = getOpcodeDef<GBuildVector>(BVO2, MRI);
  if (!BV1 || !BV2)
    return false;

  for (int I = 0, E = DstTy.getNumElements(); I < E; I++) {
    auto ValAndVReg1 =
        getIConstantVRegValWithLookThrough(BV1->getSourceReg(I), MRI);
    auto ValAndVReg2 =
        getIConstantVRegValWithLookThrough(BV2->getSourceReg(I), MRI);
    if (!ValAndVReg1 || !ValAndVReg2 ||
        ValAndVReg1->Value != ~ValAndVReg2->Value)
      return false;
  }

  MatchInfo = {AO1, AO2, BVO1};
  return true;
}

void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
                  MachineIRBuilder &B,
                  std::tuple<Register, Register, Register> &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  B.buildInstr(
      AArch64::G_BSP, {MI.getOperand(0).getReg()},
      {std::get<2>(MatchInfo), std::get<0>(MatchInfo), std::get<1>(MatchInfo)});
  MI.eraseFromParent();
}

// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         Register &SrcReg) {
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
      DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
      DstTy != LLT::fixed_vector(8, 16))
    return false;

  auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  if (AndMI->getOpcode() != TargetOpcode::G_AND)
    return false;
  auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
  if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
    return false;

  // Check the constant splat values
  auto V1 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
  auto V2 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
  auto V3 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
  if (!V1.has_value() || !V2.has_value() || !V3.has_value())
    return false;
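  // e.g. for <4 x s32> the half-size is 16, so the expected splats are
  // 0xffff, 0x10001 and 15 respectively.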
  unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
  if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
      V3 != (HalfSize - 1))
    return false;

  SrcReg = LShrMI->getOperand(1).getReg();

  return true;
}

void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &B, Register &SrcReg) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT HalfTy =
      DstTy.changeElementCount(DstTy.getElementCount().multiplyCoefficientBy(2))
          .changeElementSize(DstTy.getScalarSizeInBits() / 2);

  Register ZeroVec = B.buildConstant(HalfTy, 0).getReg(0);
  Register CastReg =
      B.buildInstr(TargetOpcode::G_BITCAST, {HalfTy}, {SrcReg}).getReg(0);
  Register CMLTReg =
      B.buildICmp(CmpInst::Predicate::ICMP_SLT, HalfTy, CastReg, ZeroVec)
          .getReg(0);

  B.buildInstr(TargetOpcode::G_BITCAST, {DstReg}, {CMLTReg}).getReg(0);
  MI.eraseFromParent();
}

class AArch64PostLegalizerCombinerImpl : public Combiner {
protected:
  // TODO: Make CombinerHelper methods const.
  mutable CombinerHelper Helper;
  const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PostLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PostLegalizerCombinerImpl::AArch64PostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

class AArch64PostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AArch64PostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;

  struct StoreInfo {
    GStore *St = nullptr;
    // The G_PTR_ADD that's used by the store. We keep this to cache the
    // MachineInstr def.
    GPtrAdd *Ptr = nullptr;
    // The signed offset to the Ptr instruction.
    int64_t Offset = 0;
    LLT StoredType;
  };
  bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
                               CSEMIRBuilder &MIB);

  bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
                                          CSEMIRBuilder &MIB);
};
} // end anonymous namespace

void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.addRequired<GISelCSEAnalysisWrapperPass>();
    AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAArch64PostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  assert(MF.getProperties().hasProperty(
             MachineFunctionProperties::Property::Legalized) &&
         "Expected a legalized function?");
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr
                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());

  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
                                        RuleConfig, ST, MDT, LI);
  bool Changed = Impl.combineMachineInstrs();

  auto MIB = CSEMIRBuilder(MF);
  MIB.setCSEInfo(CSEInfo);
  Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
  return Changed;
}

bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
    SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
  if (Stores.size() <= 2)
    return false;

  // Profitability checks:
  int64_t BaseOffset = Stores[0].Offset;
  unsigned NumPairsExpected = Stores.size() / 2;
  unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
  // Size savings will depend on whether we can fold the offset as an
  // immediate of an ADD.
  auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
  if (!TLI.isLegalAddImmediate(BaseOffset))
    TotalInstsExpected++;
  int SavingsExpected = Stores.size() - TotalInstsExpected;
  if (SavingsExpected <= 0)
    return false;

  auto &MRI = MIB.getMF().getRegInfo();

  // We have a series of consecutive stores. Factor out the common base
  // pointer and rewrite the offsets.
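  // For the <2 x s64> example in optimizeConsecutiveMemOpAddressing below,
  // NewBase is the first store's existing G_PTR_ADD (base + 4128) and the
  // rewritten offsets become 0, 16 and 32.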
  Register NewBase = Stores[0].Ptr->getReg(0);
  for (auto &SInfo : Stores) {
    // Compute a new pointer with the new base ptr and adjusted offset.
    MIB.setInstrAndDebugLoc(*SInfo.St);
    auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
    auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
                                  NewBase, NewOff);
    if (MIB.getObserver())
      MIB.getObserver()->changingInstr(*SInfo.St);
    SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
    if (MIB.getObserver())
      MIB.getObserver()->changedInstr(*SInfo.St);
  }
  LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
                    << " stores into a base pointer and offsets.\n");
  return true;
}

static cl::opt<bool>
    EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
                              cl::init(true), cl::Hidden,
                              cl::desc("Enable consecutive memop optimization "
                                       "in AArch64PostLegalizerCombiner"));

bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
    MachineFunction &MF, CSEMIRBuilder &MIB) {
  // This combine needs to run after all reassociations/folds on pointer
  // addressing have been done, specifically those that combine two G_PTR_ADDs
  // with constant offsets into a single G_PTR_ADD with a combined offset.
  // The goal of this optimization is to undo that combine in the case where
  // doing so has prevented the formation of pair stores due to illegal
  // addressing modes of STP. The reason that we do it here is because
  // it's much easier to undo the transformation of a series of consecutive
  // mem ops than it is to detect when doing it would be a bad idea looking
  // at a single G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
  //
  // An example:
  //   G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
  //   %off1:_(s64) = G_CONSTANT i64 4128
  //   %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
  //   %off2:_(s64) = G_CONSTANT i64 4144
  //   %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
  //   %off3:_(s64) = G_CONSTANT i64 4160
  //   %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
  bool Changed = false;
  auto &MRI = MF.getRegInfo();

  if (!EnableConsecutiveMemOpOpt)
    return Changed;

  SmallVector<StoreInfo, 8> Stores;
  // If we see a load, then we keep track of any values defined by it.
  // In the following example, STP formation will fail anyway because
  // the latter store is using a load result that appears after the
  // prior store. In this situation, if we factor out the offset then
  // we increase code size for no benefit.
  //   G_STORE %v1:_(s64), %base:_(p0) :: (store (s64))
  //   %v2:_(s64) = G_LOAD %ldptr:_(p0) :: (load (s64))
  //   G_STORE %v2:_(s64), %base:_(p0) :: (store (s64))
  SmallVector<Register> LoadValsSinceLastStore;

  auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
    // Check if this store is consecutive to the last one.
    if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
        (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
         New.Offset) ||
        Last.StoredType != New.StoredType)
      return false;

    // Check if this store is using a load result that appears after the
    // last store. If so, bail out.
    if (any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
          return New.St->getValueReg() == LoadVal;
        }))
      return false;

    // Check if the current offset would be too large for STP.
    // If not, then STP formation should be able to handle it, so we don't
    // need to do anything.
    int64_t MaxLegalOffset;
    switch (New.StoredType.getSizeInBits()) {
    case 32:
      MaxLegalOffset = 252;
      break;
    case 64:
      MaxLegalOffset = 504;
      break;
    case 128:
      MaxLegalOffset = 1008;
      break;
    default:
      llvm_unreachable("Unexpected stored type size");
    }
    if (New.Offset < MaxLegalOffset)
      return false;

    // If factoring it out still wouldn't help then don't bother.
    return New.Offset - Stores[0].Offset <= MaxLegalOffset;
  };

  auto resetState = [&]() {
    Stores.clear();
    LoadValsSinceLastStore.clear();
  };

  for (auto &MBB : MF) {
    // We're looking inside a single BB at a time since the memset pattern
    // should only be in a single block.
    resetState();
    for (auto &MI : MBB) {
      // Skip for scalable vectors.
      if (auto *LdSt = dyn_cast<GLoadStore>(&MI);
          LdSt && MRI.getType(LdSt->getOperand(0).getReg()).isScalableVector())
        continue;

      if (auto *St = dyn_cast<GStore>(&MI)) {
        Register PtrBaseReg;
        APInt Offset;
        LLT StoredValTy = MRI.getType(St->getValueReg());
        unsigned ValSize = StoredValTy.getSizeInBits();
        if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
          continue;

        Register PtrReg = St->getPointerReg();
        if (mi_match(
                PtrReg, MRI,
                m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
          GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(PtrReg));
          StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};

          if (Stores.empty()) {
            Stores.push_back(New);
            continue;
          }

          // Check if this store is a valid continuation of the sequence.
          auto &Last = Stores.back();
          if (storeIsValid(Last, New)) {
            Stores.push_back(New);
            LoadValsSinceLastStore.clear(); // Reset the load value tracking.
          } else {
            // The store isn't valid to consider for the prior sequence,
            // so try to optimize what we have so far and start a new sequence.
            Changed |= tryOptimizeConsecStores(Stores, MIB);
            resetState();
            Stores.push_back(New);
          }
        }
      } else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
        LoadValsSinceLastStore.push_back(Ld->getDstReg());
      }
    }
    Changed |= tryOptimizeConsecStores(Stores, MIB);
    resetState();
  }

  return Changed;
}

char AArch64PostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 MachineInstrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 MachineInstrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) {
  return new AArch64PostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm