//===- Scalarizer.cpp - Scalarize vector operations -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass converts vector operations into scalar operations (or, optionally,
// operations on smaller vector widths), in order to expose optimization
// opportunities on the individual scalar operations.
// It is mainly intended for targets that do not have vector units, but it
// may also be useful for revectorizing code to different vector widths.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/Scalarizer.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <map>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "scalarizer"

static cl::opt<bool> ClScalarizeVariableInsertExtract(
    "scalarize-variable-insert-extract", cl::init(true), cl::Hidden,
    cl::desc("Allow the scalarizer pass to scalarize "
             "insertelement/extractelement with variable index"));

// This is disabled by default because having separate loads and stores
// makes it more likely that the -combiner-alias-analysis limits will be
// reached.
static cl::opt<bool> ClScalarizeLoadStore(
    "scalarize-load-store", cl::init(false), cl::Hidden,
    cl::desc("Allow the scalarizer pass to scalarize loads and stores"));

// Split vectors larger than this size into fragments, where each fragment is
// either a vector no larger than this size or a scalar.
//
// Instructions with operands or results of different sizes that would be split
// into a different number of fragments are currently left as-is.
static cl::opt<unsigned> ClScalarizeMinBits(
    "scalarize-min-bits", cl::init(0), cl::Hidden,
    cl::desc("Instruct the scalarizer pass to attempt to keep values of a "
             "minimum number of bits"));

namespace {

BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
  BasicBlock *BB = Itr->getParent();
  if (isa<PHINode>(Itr))
    Itr = BB->getFirstInsertionPt();
  if (Itr != BB->end())
    Itr = skipDebugIntrinsics(Itr);
  return Itr;
}

// Used to store the scattered form of a vector.
using ValueVector = SmallVector<Value *, 8>;
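// Illustrative example (not from the original source): fully scattering a
// <4 x i32> value %v produces the ValueVector {%v.i0, %v.i1, %v.i2, %v.i3},
// where each entry is an extractelement unless a cheaper definition is
// already known (see Scatterer::operator[] below).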
// Used to map a vector Value and associated type to its scattered form.
// The associated type is only non-null for pointer values that are "scattered"
// when used as pointer operands to load or store.
//
// We use std::map because we want iterators to persist across insertion and
// because the values are relatively large.
using ScatterMap = std::map<std::pair<Value *, Type *>, ValueVector>;

// Lists Instructions that have been replaced with scalar implementations,
// along with a pointer to their scattered forms.
using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;

struct VectorSplit {
  // The type of the vector.
  FixedVectorType *VecTy = nullptr;

  // The number of elements packed in a fragment (other than the remainder).
  unsigned NumPacked = 0;

  // The number of fragments (scalars or smaller vectors) into which the vector
  // shall be split.
  unsigned NumFragments = 0;

  // The type of each complete fragment.
  Type *SplitTy = nullptr;

  // The type of the remainder (last) fragment; null if all fragments are
  // complete.
  Type *RemainderTy = nullptr;

  Type *getFragmentType(unsigned I) const {
    return RemainderTy && I == NumFragments - 1 ? RemainderTy : SplitTy;
  }
};

// Provides a very limited vector-like interface for lazily accessing one
// component of a scattered vector or vector pointer.
class Scatterer {
public:
  Scatterer() = default;

  // Scatter V into the fragments described by VS.  If new instructions are
  // needed, insert them before BBI in BB.  If CachePtr is nonnull, use it to
  // cache the results.
  Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
            const VectorSplit &VS, ValueVector *cachePtr = nullptr);

  // Return component I, creating a new Value for it if necessary.
  Value *operator[](unsigned I);

  // Return the number of components.
  unsigned size() const { return VS.NumFragments; }

private:
  BasicBlock *BB;
  BasicBlock::iterator BBI;
  Value *V;
  VectorSplit VS;
  bool IsPointer;
  ValueVector *CachePtr;
  ValueVector Tmp;
};

// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
// called Name that compares X and Y in the same way as FCI.
struct FCmpSplitter {
  FCmpSplitter(FCmpInst &fci) : FCI(fci) {}

  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
                    const Twine &Name) const {
    return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
  }

  FCmpInst &FCI;
};

// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
// called Name that compares X and Y in the same way as ICI.
struct ICmpSplitter {
  ICmpSplitter(ICmpInst &ici) : ICI(ici) {}

  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
                    const Twine &Name) const {
    return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
  }

  ICmpInst &ICI;
};

// UnarySplitter(UO)(Builder, X, Name) uses Builder to create
// a unary operator like UO called Name with operand X.
struct UnarySplitter {
  UnarySplitter(UnaryOperator &uo) : UO(uo) {}

  Value *operator()(IRBuilder<> &Builder, Value *Op, const Twine &Name) const {
    return Builder.CreateUnOp(UO.getOpcode(), Op, Name);
  }

  UnaryOperator &UO;
};

// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create
// a binary operator like BO called Name with operands X and Y.
struct BinarySplitter {
  BinarySplitter(BinaryOperator &bo) : BO(bo) {}

  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
                    const Twine &Name) const {
    return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
  }

  BinaryOperator &BO;
};

// Information about a load or store that we're scalarizing.
struct VectorLayout {
  VectorLayout() = default;

  // Return the alignment of fragment Frag.
  Align getFragmentAlign(unsigned Frag) {
    return commonAlignment(VecAlign, Frag * SplitSize);
  }

  // The split of the underlying vector type.
  VectorSplit VS;

  // The alignment of the vector.
  Align VecAlign;

  // The size of each (non-remainder) fragment in bytes.
  uint64_t SplitSize = 0;
};
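// Worked example (illustrative): for a <4 x i32> access with VecAlign = 16
// and SplitSize = 4 (full scalarization), getFragmentAlign returns 16, 4, 8
// and 4 for fragments 0..3, i.e. commonAlignment(16, Frag * 4).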
/// Concatenate the given fragments to a single vector value of the type
/// described in @p VS.
static Value *concatenate(IRBuilder<> &Builder, ArrayRef<Value *> Fragments,
                          const VectorSplit &VS, Twine Name) {
  unsigned NumElements = VS.VecTy->getNumElements();
  SmallVector<int> ExtendMask;
  SmallVector<int> InsertMask;

  if (VS.NumPacked > 1) {
    // Prepare the shufflevector masks once and re-use them for all
    // fragments.
    ExtendMask.resize(NumElements, -1);
    for (unsigned I = 0; I < VS.NumPacked; ++I)
      ExtendMask[I] = I;

    InsertMask.resize(NumElements);
    for (unsigned I = 0; I < NumElements; ++I)
      InsertMask[I] = I;
  }

  Value *Res = PoisonValue::get(VS.VecTy);
  for (unsigned I = 0; I < VS.NumFragments; ++I) {
    Value *Fragment = Fragments[I];

    unsigned NumPacked = VS.NumPacked;
    if (I == VS.NumFragments - 1 && VS.RemainderTy) {
      if (auto *RemVecTy = dyn_cast<FixedVectorType>(VS.RemainderTy))
        NumPacked = RemVecTy->getNumElements();
      else
        NumPacked = 1;
    }

    if (NumPacked == 1) {
      Res = Builder.CreateInsertElement(Res, Fragment, I * VS.NumPacked,
                                        Name + ".upto" + Twine(I));
    } else {
      Fragment = Builder.CreateShuffleVector(Fragment, Fragment, ExtendMask);
      if (I == 0) {
        Res = Fragment;
      } else {
        for (unsigned J = 0; J < NumPacked; ++J)
          InsertMask[I * VS.NumPacked + J] = NumElements + J;
        Res = Builder.CreateShuffleVector(Res, Fragment, InsertMask,
                                          Name + ".upto" + Twine(I));
        for (unsigned J = 0; J < NumPacked; ++J)
          InsertMask[I * VS.NumPacked + J] = I * VS.NumPacked + J;
      }
    }
  }

  return Res;
}
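// Example of the masks above (illustrative): concatenating two <2 x float>
// fragments into a <4 x float> uses ExtendMask <0, 1, -1, -1> to widen each
// fragment to the result width, then InsertMask <0, 1, 4, 5> to merge the
// second fragment into the accumulated result, named "<Name>.upto1".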
template <typename T>
T getWithDefaultOverride(const cl::opt<T> &ClOption,
                         const std::optional<T> &DefaultOverride) {
  return ClOption.getNumOccurrences() ? ClOption
                                      : DefaultOverride.value_or(ClOption);
}

class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
public:
  ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT,
                    ScalarizerPassOptions Options)
      : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT),
        ScalarizeVariableInsertExtract(
            getWithDefaultOverride(ClScalarizeVariableInsertExtract,
                                   Options.ScalarizeVariableInsertExtract)),
        ScalarizeLoadStore(getWithDefaultOverride(ClScalarizeLoadStore,
                                                  Options.ScalarizeLoadStore)),
        ScalarizeMinBits(getWithDefaultOverride(ClScalarizeMinBits,
                                                Options.ScalarizeMinBits)) {}

  bool visit(Function &F);

  // InstVisitor methods.  They return true if the instruction was scalarized,
  // false if nothing changed.
  bool visitInstruction(Instruction &I) { return false; }
  bool visitSelectInst(SelectInst &SI);
  bool visitICmpInst(ICmpInst &ICI);
  bool visitFCmpInst(FCmpInst &FCI);
  bool visitUnaryOperator(UnaryOperator &UO);
  bool visitBinaryOperator(BinaryOperator &BO);
  bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
  bool visitCastInst(CastInst &CI);
  bool visitBitCastInst(BitCastInst &BCI);
  bool visitInsertElementInst(InsertElementInst &IEI);
  bool visitExtractElementInst(ExtractElementInst &EEI);
  bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
  bool visitPHINode(PHINode &PHI);
  bool visitLoadInst(LoadInst &LI);
  bool visitStoreInst(StoreInst &SI);
  bool visitCallInst(CallInst &ICI);
  bool visitFreezeInst(FreezeInst &FI);

private:
  Scatterer scatter(Instruction *Point, Value *V, const VectorSplit &VS);
  void gather(Instruction *Op, const ValueVector &CV, const VectorSplit &VS);
  void replaceUses(Instruction *Op, Value *CV);
  bool canTransferMetadata(unsigned Kind);
  void transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV);
  std::optional<VectorSplit> getVectorSplit(Type *Ty);
  std::optional<VectorLayout> getVectorLayout(Type *Ty, Align Alignment,
                                              const DataLayout &DL);
  bool finish();

  template <typename T> bool splitUnary(Instruction &, const T &);
  template <typename T> bool splitBinary(Instruction &, const T &);

  bool splitCall(CallInst &CI);

  ScatterMap Scattered;
  GatherList Gathered;
  bool Scalarized;

  SmallVector<WeakTrackingVH, 32> PotentiallyDeadInstrs;

  unsigned ParallelLoopAccessMDKind;

  DominatorTree *DT;

  const bool ScalarizeVariableInsertExtract;
  const bool ScalarizeLoadStore;
  const unsigned ScalarizeMinBits;
};

class ScalarizerLegacyPass : public FunctionPass {
public:
  static char ID;

  ScalarizerLegacyPass() : FunctionPass(ID) {
    initializeScalarizerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }
};

} // end anonymous namespace

char ScalarizerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer",
                      "Scalarize vector operations", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer",
                    "Scalarize vector operations", false, false)

Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
                     const VectorSplit &VS, ValueVector *cachePtr)
    : BB(bb), BBI(bbi), V(v), VS(VS), CachePtr(cachePtr) {
  IsPointer = V->getType()->isPointerTy();
  if (!CachePtr) {
    Tmp.resize(VS.NumFragments, nullptr);
  } else {
    assert((CachePtr->empty() || VS.NumFragments == CachePtr->size() ||
            IsPointer) &&
           "Inconsistent vector sizes");
    if (VS.NumFragments > CachePtr->size())
      CachePtr->resize(VS.NumFragments, nullptr);
  }
}
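// Illustrative note: when V is a pointer (the scalarized load/store case), the
// fragments produced by operator[] below are addresses rather than values;
// fragment 0 reuses V itself and fragment I > 0 becomes a constant GEP such as
//   %p.i1 = getelementptr <2 x i32>, ptr %p, i32 1
// for a hypothetical pointer %p scattered with SplitTy = <2 x i32>.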
// Return fragment Frag, creating a new Value for it if necessary.
Value *Scatterer::operator[](unsigned Frag) {
  ValueVector &CV = CachePtr ? *CachePtr : Tmp;
  // Try to reuse a previous value.
  if (CV[Frag])
    return CV[Frag];
  IRBuilder<> Builder(BB, BBI);
  if (IsPointer) {
    if (Frag == 0)
      CV[Frag] = V;
    else
      CV[Frag] = Builder.CreateConstGEP1_32(VS.SplitTy, V, Frag,
                                            V->getName() + ".i" + Twine(Frag));
    return CV[Frag];
  }

  Type *FragmentTy = VS.getFragmentType(Frag);

  if (auto *VecTy = dyn_cast<FixedVectorType>(FragmentTy)) {
    SmallVector<int> Mask;
    for (unsigned J = 0; J < VecTy->getNumElements(); ++J)
      Mask.push_back(Frag * VS.NumPacked + J);
    CV[Frag] =
        Builder.CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask,
                                    V->getName() + ".i" + Twine(Frag));
  } else {
    // Search through a chain of InsertElementInsts looking for element Frag.
    // Record other elements in the cache.  The new V is still suitable
    // for all uncached indices.
    while (true) {
      InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
      if (!Insert)
        break;
      ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
      if (!Idx)
        break;
      unsigned J = Idx->getZExtValue();
      V = Insert->getOperand(0);
      if (Frag * VS.NumPacked == J) {
        CV[Frag] = Insert->getOperand(1);
        return CV[Frag];
      }

      if (VS.NumPacked == 1 && !CV[J]) {
        // Only cache the first entry we find for each index we're not actively
        // searching for.  This prevents us from going too far up the chain and
        // caching incorrect entries.
        CV[J] = Insert->getOperand(1);
      }
    }
    CV[Frag] = Builder.CreateExtractElement(V, Frag * VS.NumPacked,
                                            V->getName() + ".i" + Twine(Frag));
  }

  return CV[Frag];
}

bool ScalarizerLegacyPass::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  Module &M = *F.getParent();
  unsigned ParallelLoopAccessMDKind =
      M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT,
                         ScalarizerPassOptions());
  return Impl.visit(F);
}

FunctionPass *llvm::createScalarizerPass() {
  return new ScalarizerLegacyPass();
}

bool ScalarizerVisitor::visit(Function &F) {
  assert(Gathered.empty() && Scattered.empty());

  Scalarized = false;

  // To ensure we replace gathered components correctly we need to do an
  // ordered traversal of the basic blocks in the function.
  ReversePostOrderTraversal<BasicBlock *> RPOT(&F.getEntryBlock());
  for (BasicBlock *BB : RPOT) {
    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
      Instruction *I = &*II;
      bool Done = InstVisitor::visit(I);
      ++II;
      if (Done && I->getType()->isVoidTy())
        I->eraseFromParent();
    }
  }
  return finish();
}

// Return a scattered form of V that can be accessed by Point.  V must be a
// vector or a pointer to a vector.
Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V,
                                     const VectorSplit &VS) {
  if (Argument *VArg = dyn_cast<Argument>(V)) {
    // Put the scattered form of arguments in the entry block,
    // so that it can be used everywhere.
    Function *F = VArg->getParent();
    BasicBlock *BB = &F->getEntryBlock();
    return Scatterer(BB, BB->begin(), V, VS, &Scattered[{V, VS.SplitTy}]);
  }
  if (Instruction *VOp = dyn_cast<Instruction>(V)) {
    // When scalarizing PHI nodes we might try to examine/rewrite InsertElement
    // nodes in predecessors.  If those predecessors are unreachable from
    // entry, then the IR in those blocks could have unexpected properties
    // resulting in infinite loops in Scatterer::operator[].  By simply
    // treating values originating from instructions in unreachable blocks as
    // undef we do not need to analyse them further.
    if (!DT->isReachableFromEntry(VOp->getParent()))
      return Scatterer(Point->getParent(), Point->getIterator(),
                       PoisonValue::get(V->getType()), VS);
    // Put the scattered form of an instruction directly after the
    // instruction, skipping over PHI nodes and debug intrinsics.
    BasicBlock *BB = VOp->getParent();
    return Scatterer(
        BB, skipPastPhiNodesAndDbg(std::next(BasicBlock::iterator(VOp))), V,
        VS, &Scattered[{V, VS.SplitTy}]);
  }
  // In the fallback case, just put the scattered form before Point and
  // keep the result local to Point.
  return Scatterer(Point->getParent(), Point->getIterator(), V, VS);
}
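// Illustrative summary of scatter's placement policy (informal, mirrors the
// code above): extracts for an Argument land at the top of the entry block so
// they dominate every use; extracts for an Instruction land immediately after
// it, past PHIs and debug intrinsics; anything else is materialized just
// before Point and is not cached.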
// Replace Op with the gathered form of the components in CV.  Defer the
// deletion of Op and creation of the gathered form to the end of the pass,
// so that we can avoid creating the gathered form if all uses of Op are
// replaced with uses of CV.
void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV,
                               const VectorSplit &VS) {
  transferMetadataAndIRFlags(Op, CV);

  // If we already have a scattered form of Op (created from ExtractElements
  // of Op itself), replace them with the new form.
  ValueVector &SV = Scattered[{Op, VS.SplitTy}];
  if (!SV.empty()) {
    for (unsigned I = 0, E = SV.size(); I != E; ++I) {
      Value *V = SV[I];
      if (V == nullptr || SV[I] == CV[I])
        continue;

      Instruction *Old = cast<Instruction>(V);
      if (isa<Instruction>(CV[I]))
        CV[I]->takeName(Old);
      Old->replaceAllUsesWith(CV[I]);
      PotentiallyDeadInstrs.emplace_back(Old);
    }
  }
  SV = CV;
  Gathered.push_back(GatherList::value_type(Op, &SV));
}

// Replace Op with CV and collect Op as a potentially dead instruction.
void ScalarizerVisitor::replaceUses(Instruction *Op, Value *CV) {
  if (CV != Op) {
    Op->replaceAllUsesWith(CV);
    PotentiallyDeadInstrs.emplace_back(Op);
    Scalarized = true;
  }
}

// Return true if it is safe to transfer the given metadata tag from
// vector to scalar instructions.
bool ScalarizerVisitor::canTransferMetadata(unsigned Tag) {
  return (Tag == LLVMContext::MD_tbaa || Tag == LLVMContext::MD_fpmath ||
          Tag == LLVMContext::MD_tbaa_struct ||
          Tag == LLVMContext::MD_invariant_load ||
          Tag == LLVMContext::MD_alias_scope ||
          Tag == LLVMContext::MD_noalias || Tag == ParallelLoopAccessMDKind ||
          Tag == LLVMContext::MD_access_group);
}

// Transfer metadata from Op to the instructions in CV if it is known
// to be safe to do so.
void ScalarizerVisitor::transferMetadataAndIRFlags(Instruction *Op,
                                                   const ValueVector &CV) {
  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
  Op->getAllMetadataOtherThanDebugLoc(MDs);
  for (unsigned I = 0, E = CV.size(); I != E; ++I) {
    if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
      for (const auto &MD : MDs)
        if (canTransferMetadata(MD.first))
          New->setMetadata(MD.first, MD.second);
      New->copyIRFlags(Op);
      if (Op->getDebugLoc() && !New->getDebugLoc())
        New->setDebugLoc(Op->getDebugLoc());
    }
  }
}

// Determine how Ty is split, if at all.
std::optional<VectorSplit> ScalarizerVisitor::getVectorSplit(Type *Ty) {
  VectorSplit Split;
  Split.VecTy = dyn_cast<FixedVectorType>(Ty);
  if (!Split.VecTy)
    return {};

  unsigned NumElems = Split.VecTy->getNumElements();
  Type *ElemTy = Split.VecTy->getElementType();

  if (NumElems == 1 || ElemTy->isPointerTy() ||
      2 * ElemTy->getScalarSizeInBits() > ScalarizeMinBits) {
    Split.NumPacked = 1;
    Split.NumFragments = NumElems;
    Split.SplitTy = ElemTy;
  } else {
    Split.NumPacked = ScalarizeMinBits / ElemTy->getScalarSizeInBits();
    if (Split.NumPacked >= NumElems)
      return {};

    Split.NumFragments = divideCeil(NumElems, Split.NumPacked);
    Split.SplitTy = FixedVectorType::get(ElemTy, Split.NumPacked);

    unsigned RemainderElems = NumElems % Split.NumPacked;
    if (RemainderElems > 1)
      Split.RemainderTy = FixedVectorType::get(ElemTy, RemainderElems);
    else if (RemainderElems == 1)
      Split.RemainderTy = ElemTy;
  }

  return Split;
}
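// Worked example (illustrative): with -scalarize-min-bits=32,
// getVectorSplit(<5 x i8>) yields NumPacked = 4, NumFragments = 2,
// SplitTy = <4 x i8> and RemainderTy = i8; with the default of 0 the same
// type is fully scalarized (NumPacked = 1, NumFragments = 5, SplitTy = i8).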
// Try to compute the layout of a vector access of type Ty, returning it on
// success.  Alignment is the alignment of the vector access.
std::optional<VectorLayout>
ScalarizerVisitor::getVectorLayout(Type *Ty, Align Alignment,
                                   const DataLayout &DL) {
  std::optional<VectorSplit> VS = getVectorSplit(Ty);
  if (!VS)
    return {};
  VectorLayout Layout;
  Layout.VS = *VS;
  // Check that we're dealing with full-byte fragments.
  if (!DL.typeSizeEqualsStoreSize(VS->SplitTy) ||
      (VS->RemainderTy && !DL.typeSizeEqualsStoreSize(VS->RemainderTy)))
    return {};
  Layout.VecAlign = Alignment;
  Layout.SplitSize = DL.getTypeStoreSize(VS->SplitTy);
  return Layout;
}

// Scalarize one-operand instruction I, using Split(Builder, X, Name)
// to create an instruction like I with operand X and name Name.
template <typename Splitter>
bool ScalarizerVisitor::splitUnary(Instruction &I, const Splitter &Split) {
  std::optional<VectorSplit> VS = getVectorSplit(I.getType());
  if (!VS)
    return false;

  std::optional<VectorSplit> OpVS;
  if (I.getOperand(0)->getType() == I.getType()) {
    OpVS = VS;
  } else {
    OpVS = getVectorSplit(I.getOperand(0)->getType());
    if (!OpVS || VS->NumPacked != OpVS->NumPacked)
      return false;
  }

  IRBuilder<> Builder(&I);
  Scatterer Op = scatter(&I, I.getOperand(0), *OpVS);
  assert(Op.size() == VS->NumFragments && "Mismatched unary operation");
  ValueVector Res;
  Res.resize(VS->NumFragments);
  for (unsigned Frag = 0; Frag < VS->NumFragments; ++Frag)
    Res[Frag] = Split(Builder, Op[Frag], I.getName() + ".i" + Twine(Frag));
  gather(&I, Res, *VS);
  return true;
}

// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
// to create an instruction like I with operands X and Y and name Name.
template <typename Splitter>
bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) {
  std::optional<VectorSplit> VS = getVectorSplit(I.getType());
  if (!VS)
    return false;

  std::optional<VectorSplit> OpVS;
  if (I.getOperand(0)->getType() == I.getType()) {
    OpVS = VS;
  } else {
    OpVS = getVectorSplit(I.getOperand(0)->getType());
    if (!OpVS || VS->NumPacked != OpVS->NumPacked)
      return false;
  }

  IRBuilder<> Builder(&I);
  Scatterer VOp0 = scatter(&I, I.getOperand(0), *OpVS);
  Scatterer VOp1 = scatter(&I, I.getOperand(1), *OpVS);
  assert(VOp0.size() == VS->NumFragments && "Mismatched binary operation");
  assert(VOp1.size() == VS->NumFragments && "Mismatched binary operation");
  ValueVector Res;
  Res.resize(VS->NumFragments);
  for (unsigned Frag = 0; Frag < VS->NumFragments; ++Frag) {
    Value *Op0 = VOp0[Frag];
    Value *Op1 = VOp1[Frag];
    Res[Frag] = Split(Builder, Op0, Op1, I.getName() + ".i" + Twine(Frag));
  }
  gather(&I, Res, *VS);
  return true;
}
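// Example (illustrative): splitBinary rewrites
//   %r = fadd <2 x float> %a, %b
// into
//   %r.i0 = fadd float %a.i0, %b.i0
//   %r.i1 = fadd float %a.i1, %b.i1
// and the <2 x float> value is only rebuilt in finish() if a vector use
// remains.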
This could // happen if the vector elements are different and the ScalarizeMinBits // option is used. // // We could in principle handle this case as well, at the cost of // complicating the scattering machinery to support multiple scattering // granularities for a single value. return false; } Scattered[I] = scatter(&CI, OpI, *OpVS); if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) { OverloadIdx[I] = Tys.size(); Tys.push_back(OpVS->SplitTy); } } else { ScalarOperands[I] = OpI; if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) Tys.push_back(OpI->getType()); } } ValueVector Res(VS->NumFragments); ValueVector ScalarCallOps(NumArgs); Function *NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); IRBuilder<> Builder(&CI); // Perform actual scalarization, taking care to preserve any scalar operands. for (unsigned I = 0; I < VS->NumFragments; ++I) { bool IsRemainder = I == VS->NumFragments - 1 && VS->RemainderTy; ScalarCallOps.clear(); if (IsRemainder) Tys[0] = VS->RemainderTy; for (unsigned J = 0; J != NumArgs; ++J) { if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) { ScalarCallOps.push_back(ScalarOperands[J]); } else { ScalarCallOps.push_back(Scattered[J][I]); if (IsRemainder && OverloadIdx[J] >= 0) Tys[OverloadIdx[J]] = Scattered[J][I]->getType(); } } if (IsRemainder) NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); Res[I] = Builder.CreateCall(NewIntrin, ScalarCallOps, CI.getName() + ".i" + Twine(I)); } gather(&CI, Res, *VS); return true; } bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) { std::optional VS = getVectorSplit(SI.getType()); if (!VS) return false; std::optional CondVS; if (isa(SI.getCondition()->getType())) { CondVS = getVectorSplit(SI.getCondition()->getType()); if (!CondVS || CondVS->NumPacked != VS->NumPacked) { // This happens when ScalarizeMinBits is used. return false; } } IRBuilder<> Builder(&SI); Scatterer VOp1 = scatter(&SI, SI.getOperand(1), *VS); Scatterer VOp2 = scatter(&SI, SI.getOperand(2), *VS); assert(VOp1.size() == VS->NumFragments && "Mismatched select"); assert(VOp2.size() == VS->NumFragments && "Mismatched select"); ValueVector Res; Res.resize(VS->NumFragments); if (CondVS) { Scatterer VOp0 = scatter(&SI, SI.getOperand(0), *CondVS); assert(VOp0.size() == CondVS->NumFragments && "Mismatched select"); for (unsigned I = 0; I < VS->NumFragments; ++I) { Value *Op0 = VOp0[I]; Value *Op1 = VOp1[I]; Value *Op2 = VOp2[I]; Res[I] = Builder.CreateSelect(Op0, Op1, Op2, SI.getName() + ".i" + Twine(I)); } } else { Value *Op0 = SI.getOperand(0); for (unsigned I = 0; I < VS->NumFragments; ++I) { Value *Op1 = VOp1[I]; Value *Op2 = VOp2[I]; Res[I] = Builder.CreateSelect(Op0, Op1, Op2, SI.getName() + ".i" + Twine(I)); } } gather(&SI, Res, *VS); return true; } bool ScalarizerVisitor::visitICmpInst(ICmpInst &ICI) { return splitBinary(ICI, ICmpSplitter(ICI)); } bool ScalarizerVisitor::visitFCmpInst(FCmpInst &FCI) { return splitBinary(FCI, FCmpSplitter(FCI)); } bool ScalarizerVisitor::visitUnaryOperator(UnaryOperator &UO) { return splitUnary(UO, UnarySplitter(UO)); } bool ScalarizerVisitor::visitBinaryOperator(BinaryOperator &BO) { return splitBinary(BO, BinarySplitter(BO)); } bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { std::optional VS = getVectorSplit(GEPI.getType()); if (!VS) return false; IRBuilder<> Builder(&GEPI); unsigned NumIndices = GEPI.getNumIndices(); // The base pointer and indices might be scalar even if it's a vector GEP. 
bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) {
  std::optional<VectorSplit> VS = getVectorSplit(SI.getType());
  if (!VS)
    return false;

  std::optional<VectorSplit> CondVS;
  if (isa<FixedVectorType>(SI.getCondition()->getType())) {
    CondVS = getVectorSplit(SI.getCondition()->getType());
    if (!CondVS || CondVS->NumPacked != VS->NumPacked) {
      // This happens when ScalarizeMinBits is used.
      return false;
    }
  }

  IRBuilder<> Builder(&SI);
  Scatterer VOp1 = scatter(&SI, SI.getOperand(1), *VS);
  Scatterer VOp2 = scatter(&SI, SI.getOperand(2), *VS);
  assert(VOp1.size() == VS->NumFragments && "Mismatched select");
  assert(VOp2.size() == VS->NumFragments && "Mismatched select");
  ValueVector Res;
  Res.resize(VS->NumFragments);

  if (CondVS) {
    Scatterer VOp0 = scatter(&SI, SI.getOperand(0), *CondVS);
    assert(VOp0.size() == CondVS->NumFragments && "Mismatched select");
    for (unsigned I = 0; I < VS->NumFragments; ++I) {
      Value *Op0 = VOp0[I];
      Value *Op1 = VOp1[I];
      Value *Op2 = VOp2[I];
      Res[I] =
          Builder.CreateSelect(Op0, Op1, Op2, SI.getName() + ".i" + Twine(I));
    }
  } else {
    Value *Op0 = SI.getOperand(0);
    for (unsigned I = 0; I < VS->NumFragments; ++I) {
      Value *Op1 = VOp1[I];
      Value *Op2 = VOp2[I];
      Res[I] =
          Builder.CreateSelect(Op0, Op1, Op2, SI.getName() + ".i" + Twine(I));
    }
  }
  gather(&SI, Res, *VS);
  return true;
}

bool ScalarizerVisitor::visitICmpInst(ICmpInst &ICI) {
  return splitBinary(ICI, ICmpSplitter(ICI));
}

bool ScalarizerVisitor::visitFCmpInst(FCmpInst &FCI) {
  return splitBinary(FCI, FCmpSplitter(FCI));
}

bool ScalarizerVisitor::visitUnaryOperator(UnaryOperator &UO) {
  return splitUnary(UO, UnarySplitter(UO));
}

bool ScalarizerVisitor::visitBinaryOperator(BinaryOperator &BO) {
  return splitBinary(BO, BinarySplitter(BO));
}

bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
  std::optional<VectorSplit> VS = getVectorSplit(GEPI.getType());
  if (!VS)
    return false;

  IRBuilder<> Builder(&GEPI);
  unsigned NumIndices = GEPI.getNumIndices();

  // The base pointer and indices might be scalar even if it's a vector GEP.
  SmallVector<Value *, 8> ScalarOps{1 + NumIndices};
  SmallVector<Scatterer, 8> ScatterOps{1 + NumIndices};

  for (unsigned I = 0; I < 1 + NumIndices; ++I) {
    if (auto *VecTy =
            dyn_cast<FixedVectorType>(GEPI.getOperand(I)->getType())) {
      std::optional<VectorSplit> OpVS = getVectorSplit(VecTy);
      if (!OpVS || OpVS->NumPacked != VS->NumPacked) {
        // This can happen when ScalarizeMinBits is used.
        return false;
      }
      ScatterOps[I] = scatter(&GEPI, GEPI.getOperand(I), *OpVS);
    } else {
      ScalarOps[I] = GEPI.getOperand(I);
    }
  }

  ValueVector Res;
  Res.resize(VS->NumFragments);
  for (unsigned I = 0; I < VS->NumFragments; ++I) {
    SmallVector<Value *, 8> SplitOps;
    SplitOps.resize(1 + NumIndices);
    for (unsigned J = 0; J < 1 + NumIndices; ++J) {
      if (ScalarOps[J])
        SplitOps[J] = ScalarOps[J];
      else
        SplitOps[J] = ScatterOps[J][I];
    }
    Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), SplitOps[0],
                               ArrayRef(SplitOps).drop_front(),
                               GEPI.getName() + ".i" + Twine(I));
    if (GEPI.isInBounds())
      if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
        NewGEPI->setIsInBounds();
  }
  gather(&GEPI, Res, *VS);
  return true;
}

bool ScalarizerVisitor::visitCastInst(CastInst &CI) {
  std::optional<VectorSplit> DestVS = getVectorSplit(CI.getDestTy());
  if (!DestVS)
    return false;

  std::optional<VectorSplit> SrcVS = getVectorSplit(CI.getSrcTy());
  if (!SrcVS || SrcVS->NumPacked != DestVS->NumPacked)
    return false;

  IRBuilder<> Builder(&CI);
  Scatterer Op0 = scatter(&CI, CI.getOperand(0), *SrcVS);
  assert(Op0.size() == SrcVS->NumFragments && "Mismatched cast");
  ValueVector Res;
  Res.resize(DestVS->NumFragments);
  for (unsigned I = 0; I < DestVS->NumFragments; ++I)
    Res[I] =
        Builder.CreateCast(CI.getOpcode(), Op0[I], DestVS->getFragmentType(I),
                           CI.getName() + ".i" + Twine(I));
  gather(&CI, Res, *DestVS);
  return true;
}
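// Example (illustrative): visitCastInst rewrites
//   %t = fptosi <2 x double> %d to <2 x i32>
// into
//   %t.i0 = fptosi double %d.i0 to i32
//   %t.i1 = fptosi double %d.i1 to i32
// provided source and destination split into the same number of packed
// elements.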
bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
  std::optional<VectorSplit> DstVS = getVectorSplit(BCI.getDestTy());
  std::optional<VectorSplit> SrcVS = getVectorSplit(BCI.getSrcTy());
  if (!DstVS || !SrcVS || DstVS->RemainderTy || SrcVS->RemainderTy)
    return false;

  const bool isPointerTy = DstVS->VecTy->getElementType()->isPointerTy();

  // Vectors of pointers are always fully scalarized.
  assert(!isPointerTy || (DstVS->NumPacked == 1 && SrcVS->NumPacked == 1));

  IRBuilder<> Builder(&BCI);
  Scatterer Op0 = scatter(&BCI, BCI.getOperand(0), *SrcVS);
  ValueVector Res;
  Res.resize(DstVS->NumFragments);

  unsigned DstSplitBits = DstVS->SplitTy->getPrimitiveSizeInBits();
  unsigned SrcSplitBits = SrcVS->SplitTy->getPrimitiveSizeInBits();

  if (isPointerTy || DstSplitBits == SrcSplitBits) {
    assert(DstVS->NumFragments == SrcVS->NumFragments);
    for (unsigned I = 0; I < DstVS->NumFragments; ++I) {
      Res[I] = Builder.CreateBitCast(Op0[I], DstVS->getFragmentType(I),
                                     BCI.getName() + ".i" + Twine(I));
    }
  } else if (SrcSplitBits % DstSplitBits == 0) {
    // Convert each source fragment to the same-sized destination vector and
    // then scatter the result to the destination.
    VectorSplit MidVS;
    MidVS.NumPacked = DstVS->NumPacked;
    MidVS.NumFragments = SrcSplitBits / DstSplitBits;
    MidVS.VecTy = FixedVectorType::get(DstVS->VecTy->getElementType(),
                                       MidVS.NumPacked * MidVS.NumFragments);
    MidVS.SplitTy = DstVS->SplitTy;

    unsigned ResI = 0;
    for (unsigned I = 0; I < SrcVS->NumFragments; ++I) {
      Value *V = Op0[I];

      // Look through any existing bitcasts before converting to the
      // intermediate vector type.  In the best case, the resulting conversion
      // might be a no-op.
      Instruction *VI;
      while ((VI = dyn_cast<Instruction>(V)) &&
             VI->getOpcode() == Instruction::BitCast)
        V = VI->getOperand(0);

      V = Builder.CreateBitCast(V, MidVS.VecTy, V->getName() + ".cast");

      Scatterer Mid = scatter(&BCI, V, MidVS);
      for (unsigned J = 0; J < MidVS.NumFragments; ++J)
        Res[ResI++] = Mid[J];
    }
  } else if (DstSplitBits % SrcSplitBits == 0) {
    // Gather enough source fragments to make up a destination fragment and
    // then convert to the destination type.
    VectorSplit MidVS;
    MidVS.NumFragments = DstSplitBits / SrcSplitBits;
    MidVS.NumPacked = SrcVS->NumPacked;
    MidVS.VecTy = FixedVectorType::get(SrcVS->VecTy->getElementType(),
                                       MidVS.NumPacked * MidVS.NumFragments);
    MidVS.SplitTy = SrcVS->SplitTy;

    unsigned SrcI = 0;
    SmallVector<Value *, 8> ConcatOps;
    ConcatOps.resize(MidVS.NumFragments);
    for (unsigned I = 0; I < DstVS->NumFragments; ++I) {
      for (unsigned J = 0; J < MidVS.NumFragments; ++J)
        ConcatOps[J] = Op0[SrcI++];
      Value *V = concatenate(Builder, ConcatOps, MidVS,
                             BCI.getName() + ".i" + Twine(I));
      Res[I] = Builder.CreateBitCast(V, DstVS->getFragmentType(I),
                                     BCI.getName() + ".i" + Twine(I));
    }
  } else {
    return false;
  }

  gather(&BCI, Res, *DstVS);
  return true;
}

bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
  std::optional<VectorSplit> VS = getVectorSplit(IEI.getType());
  if (!VS)
    return false;

  IRBuilder<> Builder(&IEI);
  Scatterer Op0 = scatter(&IEI, IEI.getOperand(0), *VS);
  Value *NewElt = IEI.getOperand(1);
  Value *InsIdx = IEI.getOperand(2);

  ValueVector Res;
  Res.resize(VS->NumFragments);

  if (auto *CI = dyn_cast<ConstantInt>(InsIdx)) {
    unsigned Idx = CI->getZExtValue();
    unsigned Fragment = Idx / VS->NumPacked;
    for (unsigned I = 0; I < VS->NumFragments; ++I) {
      if (I == Fragment) {
        bool IsPacked = VS->NumPacked > 1;
        if (Fragment == VS->NumFragments - 1 && VS->RemainderTy &&
            !VS->RemainderTy->isVectorTy())
          IsPacked = false;
        if (IsPacked) {
          Res[I] =
              Builder.CreateInsertElement(Op0[I], NewElt, Idx % VS->NumPacked);
        } else {
          Res[I] = NewElt;
        }
      } else {
        Res[I] = Op0[I];
      }
    }
  } else {
    // Never split a variable insertelement that isn't fully scalarized.
    if (!ScalarizeVariableInsertExtract || VS->NumPacked > 1)
      return false;

    for (unsigned I = 0; I < VS->NumFragments; ++I) {
      Value *ShouldReplace =
          Builder.CreateICmpEQ(InsIdx, ConstantInt::get(InsIdx->getType(), I),
                               InsIdx->getName() + ".is." + Twine(I));
      Value *OldElt = Op0[I];
      Res[I] = Builder.CreateSelect(ShouldReplace, NewElt, OldElt,
                                    IEI.getName() + ".i" + Twine(I));
    }
  }

  gather(&IEI, Res, *VS);
  return true;
}
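// Example (illustrative): with a variable index, inserting %elt into a fully
// scalarized <2 x i32> %x emits one compare-and-select per element:
//   %idx.is.0 = icmp eq i32 %idx, 0
//   %r.i0 = select i1 %idx.is.0, i32 %elt, i32 %x.i0
// and likewise for element 1 (%idx, %x and %r are hypothetical names).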
bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
  std::optional<VectorSplit> VS = getVectorSplit(EEI.getOperand(0)->getType());
  if (!VS)
    return false;

  IRBuilder<> Builder(&EEI);
  Scatterer Op0 = scatter(&EEI, EEI.getOperand(0), *VS);
  Value *ExtIdx = EEI.getOperand(1);

  if (auto *CI = dyn_cast<ConstantInt>(ExtIdx)) {
    unsigned Idx = CI->getZExtValue();
    unsigned Fragment = Idx / VS->NumPacked;
    Value *Res = Op0[Fragment];
    bool IsPacked = VS->NumPacked > 1;
    if (Fragment == VS->NumFragments - 1 && VS->RemainderTy &&
        !VS->RemainderTy->isVectorTy())
      IsPacked = false;
    if (IsPacked)
      Res = Builder.CreateExtractElement(Res, Idx % VS->NumPacked);
    replaceUses(&EEI, Res);
    return true;
  }

  // Never split a variable extractelement that isn't fully scalarized.
  if (!ScalarizeVariableInsertExtract || VS->NumPacked > 1)
    return false;

  Value *Res = PoisonValue::get(VS->VecTy->getElementType());
  for (unsigned I = 0; I < VS->NumFragments; ++I) {
    Value *ShouldExtract =
        Builder.CreateICmpEQ(ExtIdx, ConstantInt::get(ExtIdx->getType(), I),
                             ExtIdx->getName() + ".is." + Twine(I));
    Value *Elt = Op0[I];
    Res = Builder.CreateSelect(ShouldExtract, Elt, Res,
                               EEI.getName() + ".upto" + Twine(I));
  }
  replaceUses(&EEI, Res);
  return true;
}

bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
  std::optional<VectorSplit> VS = getVectorSplit(SVI.getType());
  std::optional<VectorSplit> VSOp =
      getVectorSplit(SVI.getOperand(0)->getType());
  if (!VS || !VSOp || VS->NumPacked > 1 || VSOp->NumPacked > 1)
    return false;

  Scatterer Op0 = scatter(&SVI, SVI.getOperand(0), *VSOp);
  Scatterer Op1 = scatter(&SVI, SVI.getOperand(1), *VSOp);
  ValueVector Res;
  Res.resize(VS->NumFragments);

  for (unsigned I = 0; I < VS->NumFragments; ++I) {
    int Selector = SVI.getMaskValue(I);
    if (Selector < 0)
      Res[I] = PoisonValue::get(VS->VecTy->getElementType());
    else if (unsigned(Selector) < Op0.size())
      Res[I] = Op0[Selector];
    else
      Res[I] = Op1[Selector - Op0.size()];
  }
  gather(&SVI, Res, *VS);
  return true;
}

bool ScalarizerVisitor::visitPHINode(PHINode &PHI) {
  std::optional<VectorSplit> VS = getVectorSplit(PHI.getType());
  if (!VS)
    return false;

  IRBuilder<> Builder(&PHI);
  ValueVector Res;
  Res.resize(VS->NumFragments);

  unsigned NumOps = PHI.getNumOperands();
  for (unsigned I = 0; I < VS->NumFragments; ++I) {
    Res[I] = Builder.CreatePHI(VS->getFragmentType(I), NumOps,
                               PHI.getName() + ".i" + Twine(I));
  }

  for (unsigned I = 0; I < NumOps; ++I) {
    Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I), *VS);
    BasicBlock *IncomingBlock = PHI.getIncomingBlock(I);
    for (unsigned J = 0; J < VS->NumFragments; ++J)
      cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock);
  }
  gather(&PHI, Res, *VS);
  return true;
}

bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) {
  if (!ScalarizeLoadStore)
    return false;
  if (!LI.isSimple())
    return false;

  std::optional<VectorLayout> Layout = getVectorLayout(
      LI.getType(), LI.getAlign(), LI.getModule()->getDataLayout());
  if (!Layout)
    return false;

  IRBuilder<> Builder(&LI);
  Scatterer Ptr = scatter(&LI, LI.getPointerOperand(), Layout->VS);
  ValueVector Res;
  Res.resize(Layout->VS.NumFragments);

  for (unsigned I = 0; I < Layout->VS.NumFragments; ++I)
    Res[I] = Builder.CreateAlignedLoad(Layout->VS.getFragmentType(I), Ptr[I],
                                       Align(Layout->getFragmentAlign(I)),
                                       LI.getName() + ".i" + Twine(I));
  gather(&LI, Res, Layout->VS);
  return true;
}

bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) {
  if (!ScalarizeLoadStore)
    return false;
  if (!SI.isSimple())
    return false;

  Value *FullValue = SI.getValueOperand();
  std::optional<VectorLayout> Layout = getVectorLayout(
      FullValue->getType(), SI.getAlign(), SI.getModule()->getDataLayout());
  if (!Layout)
    return false;

  IRBuilder<> Builder(&SI);
  Scatterer VPtr = scatter(&SI, SI.getPointerOperand(), Layout->VS);
  Scatterer VVal = scatter(&SI, FullValue, Layout->VS);

  ValueVector Stores;
  Stores.resize(Layout->VS.NumFragments);
  for (unsigned I = 0; I < Layout->VS.NumFragments; ++I) {
    Value *Val = VVal[I];
    Value *Ptr = VPtr[I];
    Stores[I] =
        Builder.CreateAlignedStore(Val, Ptr, Layout->getFragmentAlign(I));
  }
  transferMetadataAndIRFlags(&SI, Stores);
  return true;
}
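// Example (illustrative, requires -scalarize-load-store): a simple store of a
// <2 x i32> value at align 8 becomes two i32 stores with alignments 8 and 4,
// the addresses coming from the scattered (GEP'd) pointer operand.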
bool ScalarizerVisitor::visitCallInst(CallInst &CI) { return splitCall(CI); }

bool ScalarizerVisitor::visitFreezeInst(FreezeInst &FI) {
  return splitUnary(FI, [](IRBuilder<> &Builder, Value *Op,
                           const Twine &Name) {
    return Builder.CreateFreeze(Op, Name);
  });
}

// Delete the instructions that we scalarized.  If a full vector result
// is still needed, recreate it using InsertElements.
bool ScalarizerVisitor::finish() {
  // The presence of data in Gathered or Scattered indicates changes
  // made to the Function.
  if (Gathered.empty() && Scattered.empty() && !Scalarized)
    return false;
  for (const auto &GMI : Gathered) {
    Instruction *Op = GMI.first;
    ValueVector &CV = *GMI.second;
    if (!Op->use_empty()) {
      // The value is still needed, so recreate it using a series of
      // insertelements and/or shufflevectors.
      Value *Res;
      if (auto *Ty = dyn_cast<FixedVectorType>(Op->getType())) {
        BasicBlock *BB = Op->getParent();
        IRBuilder<> Builder(Op);
        if (isa<PHINode>(Op))
          Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());

        VectorSplit VS = *getVectorSplit(Ty);
        assert(VS.NumFragments == CV.size());

        Res = concatenate(Builder, CV, VS, Op->getName());

        Res->takeName(Op);
      } else {
        assert(CV.size() == 1 && Op->getType() == CV[0]->getType());
        Res = CV[0];
        if (Op == Res)
          continue;
      }
      Op->replaceAllUsesWith(Res);
    }
    PotentiallyDeadInstrs.emplace_back(Op);
  }
  Gathered.clear();
  Scattered.clear();
  Scalarized = false;

  RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs);

  return true;
}

PreservedAnalyses ScalarizerPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
  Module &M = *F.getParent();
  unsigned ParallelLoopAccessMDKind =
      M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT, Options);
  bool Changed = Impl.visit(F);
  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return Changed ? PA : PreservedAnalyses::all();
}
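// Usage note (informal): with the new pass manager this pass runs as
// "-passes=scalarizer"; the cl::opt flags above override the corresponding
// ScalarizerPassOptions fields whenever they appear on the command line
// (see getWithDefaultOverride).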