//===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // \file // Implementation file for the IRSimilarityIdentifier for identifying // similarities in IR including the IRInstructionMapper. // //===----------------------------------------------------------------------===// #include "llvm/Analysis/IRSimilarityIdentifier.h" #include "llvm/ADT/DenseMap.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Operator.h" #include "llvm/IR/User.h" #include "llvm/InitializePasses.h" #include "llvm/Support/SuffixTree.h" using namespace llvm; using namespace IRSimilarity; namespace llvm { cl::opt DisableBranches("no-ir-sim-branch-matching", cl::init(false), cl::ReallyHidden, cl::desc("disable similarity matching, and outlining, " "across branches for debugging purposes.")); cl::opt DisableIndirectCalls("no-ir-sim-indirect-calls", cl::init(false), cl::ReallyHidden, cl::desc("disable outlining indirect calls.")); cl::opt MatchCallsByName("ir-sim-calls-by-name", cl::init(false), cl::ReallyHidden, cl::desc("only allow matching call instructions if the " "name and type signature match.")); cl::opt DisableIntrinsics("no-ir-sim-intrinsics", cl::init(false), cl::ReallyHidden, cl::desc("Don't match or outline intrinsics")); } // namespace llvm IRInstructionData::IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDList) : Inst(&I), Legal(Legality), IDL(&IDList) { initializeInstruction(); } void IRInstructionData::initializeInstruction() { // We check for whether we have a comparison instruction. If it is, we // find the "less than" version of the predicate for consistency for // comparison instructions throught the program. if (CmpInst *C = dyn_cast(Inst)) { CmpInst::Predicate Predicate = predicateForConsistency(C); if (Predicate != C->getPredicate()) RevisedPredicate = Predicate; } // Here we collect the operands and their types for determining whether // the structure of the operand use matches between two different candidates. for (Use &OI : Inst->operands()) { if (isa(Inst) && RevisedPredicate) { // If we have a CmpInst where the predicate is reversed, it means the // operands must be reversed as well. OperVals.insert(OperVals.begin(), OI.get()); continue; } OperVals.push_back(OI.get()); } // We capture the incoming BasicBlocks as values as well as the incoming // Values in order to check for structural similarity. if (PHINode *PN = dyn_cast(Inst)) for (BasicBlock *BB : PN->blocks()) OperVals.push_back(BB); } IRInstructionData::IRInstructionData(IRInstructionDataList &IDList) : IDL(&IDList) {} void IRInstructionData::setBranchSuccessors( DenseMap &BasicBlockToInteger) { assert(isa(Inst) && "Instruction must be branch"); BranchInst *BI = cast(Inst); DenseMap::iterator BBNumIt; BBNumIt = BasicBlockToInteger.find(BI->getParent()); assert(BBNumIt != BasicBlockToInteger.end() && "Could not find location for BasicBlock!"); int CurrentBlockNumber = static_cast(BBNumIt->second); for (BasicBlock *Successor : BI->successors()) { BBNumIt = BasicBlockToInteger.find(Successor); assert(BBNumIt != BasicBlockToInteger.end() && "Could not find number for BasicBlock!"); int OtherBlockNumber = static_cast(BBNumIt->second); int Relative = OtherBlockNumber - CurrentBlockNumber; RelativeBlockLocations.push_back(Relative); } } void IRInstructionData::setCalleeName(bool MatchByName) { CallInst *CI = dyn_cast(Inst); assert(CI && "Instruction must be call"); CalleeName = ""; if (IntrinsicInst *II = dyn_cast(Inst)) { // To hash intrinsics, we use the opcode, and types like the other // instructions, but also, the Intrinsic ID, and the Name of the // intrinsic. Intrinsic::ID IntrinsicID = II->getIntrinsicID(); FunctionType *FT = II->getFunctionType(); // If there is an overloaded name, we have to use the complex version // of getName to get the entire string. if (Intrinsic::isOverloaded(IntrinsicID)) CalleeName = Intrinsic::getName(IntrinsicID, FT->params(), II->getModule(), FT); // If there is not an overloaded name, we only need to use this version. else CalleeName = Intrinsic::getName(IntrinsicID).str(); return; } if (!CI->isIndirectCall() && MatchByName) CalleeName = CI->getCalledFunction()->getName().str(); } void IRInstructionData::setPHIPredecessors( DenseMap &BasicBlockToInteger) { assert(isa(Inst) && "Instruction must be phi node"); PHINode *PN = cast(Inst); DenseMap::iterator BBNumIt; BBNumIt = BasicBlockToInteger.find(PN->getParent()); assert(BBNumIt != BasicBlockToInteger.end() && "Could not find location for BasicBlock!"); int CurrentBlockNumber = static_cast(BBNumIt->second); // Convert the incoming blocks of the PHINode to an integer value, based on // the relative distances between the current block and the incoming block. for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) { BasicBlock *Incoming = PN->getIncomingBlock(Idx); BBNumIt = BasicBlockToInteger.find(Incoming); assert(BBNumIt != BasicBlockToInteger.end() && "Could not find number for BasicBlock!"); int OtherBlockNumber = static_cast(BBNumIt->second); int Relative = OtherBlockNumber - CurrentBlockNumber; RelativeBlockLocations.push_back(Relative); RelativeBlockLocations.push_back(Relative); } } CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) { switch (CI->getPredicate()) { case CmpInst::FCMP_OGT: case CmpInst::FCMP_UGT: case CmpInst::FCMP_OGE: case CmpInst::FCMP_UGE: case CmpInst::ICMP_SGT: case CmpInst::ICMP_UGT: case CmpInst::ICMP_SGE: case CmpInst::ICMP_UGE: return CI->getSwappedPredicate(); default: return CI->getPredicate(); } } CmpInst::Predicate IRInstructionData::getPredicate() const { assert(isa(Inst) && "Can only get a predicate from a compare instruction"); if (RevisedPredicate) return RevisedPredicate.value(); return cast(Inst)->getPredicate(); } StringRef IRInstructionData::getCalleeName() const { assert(isa(Inst) && "Can only get a name from a call instruction"); assert(CalleeName && "CalleeName has not been set"); return *CalleeName; } bool IRSimilarity::isClose(const IRInstructionData &A, const IRInstructionData &B) { if (!A.Legal || !B.Legal) return false; // Check if we are performing the same sort of operation on the same types // but not on the same values. if (!A.Inst->isSameOperationAs(B.Inst)) { // If there is a predicate, this means that either there is a swapped // predicate, or that the types are different, we want to make sure that // the predicates are equivalent via swapping. if (isa(A.Inst) && isa(B.Inst)) { if (A.getPredicate() != B.getPredicate()) return false; // If the predicates are the same via swap, make sure that the types are // still the same. auto ZippedTypes = zip(A.OperVals, B.OperVals); return all_of( ZippedTypes, [](std::tuple R) { return std::get<0>(R)->getType() == std::get<1>(R)->getType(); }); } return false; } // Since any GEP Instruction operands after the first operand cannot be // defined by a register, we must make sure that the operands after the first // are the same in the two instructions if (auto *GEP = dyn_cast(A.Inst)) { auto *OtherGEP = cast(B.Inst); // If the instructions do not have the same inbounds restrictions, we do // not consider them the same. if (GEP->isInBounds() != OtherGEP->isInBounds()) return false; auto ZippedOperands = zip(GEP->indices(), OtherGEP->indices()); // We increment here since we do not care about the first instruction, // we only care about the following operands since they must be the // exact same to be considered similar. return all_of(drop_begin(ZippedOperands), [](std::tuple R) { return std::get<0>(R) == std::get<1>(R); }); } // If the instructions are functions calls, we make sure that the function // name is the same. We already know that the types are since is // isSameOperationAs is true. if (isa(A.Inst) && isa(B.Inst)) { if (A.getCalleeName().str() != B.getCalleeName().str()) return false; } if (isa(A.Inst) && isa(B.Inst) && A.RelativeBlockLocations.size() != B.RelativeBlockLocations.size()) return false; return true; } // TODO: This is the same as the MachineOutliner, and should be consolidated // into the same interface. void IRInstructionMapper::convertToUnsignedVec( BasicBlock &BB, std::vector &InstrList, std::vector &IntegerMapping) { BasicBlock::iterator It = BB.begin(); std::vector IntegerMappingForBB; std::vector InstrListForBB; for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) { switch (InstClassifier.visit(*It)) { case InstrType::Legal: mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB); break; case InstrType::Illegal: mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB); break; case InstrType::Invisible: AddedIllegalLastTime = false; break; } } if (AddedIllegalLastTime) mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true); for (IRInstructionData *ID : InstrListForBB) this->IDL->push_back(*ID); llvm::append_range(InstrList, InstrListForBB); llvm::append_range(IntegerMapping, IntegerMappingForBB); } // TODO: This is the same as the MachineOutliner, and should be consolidated // into the same interface. unsigned IRInstructionMapper::mapToLegalUnsigned( BasicBlock::iterator &It, std::vector &IntegerMappingForBB, std::vector &InstrListForBB) { // We added something legal, so we should unset the AddedLegalLastTime // flag. AddedIllegalLastTime = false; // If we have at least two adjacent legal instructions (which may have // invisible instructions in between), remember that. if (CanCombineWithPrevInstr) HaveLegalRange = true; CanCombineWithPrevInstr = true; // Get the integer for this instruction or give it the current // LegalInstrNumber. IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL); InstrListForBB.push_back(ID); if (isa(*It)) ID->setBranchSuccessors(BasicBlockToInteger); if (isa(*It)) ID->setCalleeName(EnableMatchCallsByName); if (isa(*It)) ID->setPHIPredecessors(BasicBlockToInteger); // Add to the instruction list bool WasInserted; DenseMap::iterator ResultIt; std::tie(ResultIt, WasInserted) = InstructionIntegerMap.insert(std::make_pair(ID, LegalInstrNumber)); unsigned INumber = ResultIt->second; // There was an insertion. if (WasInserted) LegalInstrNumber++; IntegerMappingForBB.push_back(INumber); // Make sure we don't overflow or use any integers reserved by the DenseMap. assert(LegalInstrNumber < IllegalInstrNumber && "Instruction mapping overflow!"); assert(LegalInstrNumber != DenseMapInfo::getEmptyKey() && "Tried to assign DenseMap tombstone or empty key to instruction."); assert(LegalInstrNumber != DenseMapInfo::getTombstoneKey() && "Tried to assign DenseMap tombstone or empty key to instruction."); return INumber; } IRInstructionData * IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL) { return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL); } IRInstructionData * IRInstructionMapper::allocateIRInstructionData(IRInstructionDataList &IDL) { return new (InstDataAllocator->Allocate()) IRInstructionData(IDL); } IRInstructionDataList * IRInstructionMapper::allocateIRInstructionDataList() { return new (IDLAllocator->Allocate()) IRInstructionDataList(); } // TODO: This is the same as the MachineOutliner, and should be consolidated // into the same interface. unsigned IRInstructionMapper::mapToIllegalUnsigned( BasicBlock::iterator &It, std::vector &IntegerMappingForBB, std::vector &InstrListForBB, bool End) { // Can't combine an illegal instruction. Set the flag. CanCombineWithPrevInstr = false; // Only add one illegal number per range of legal numbers. if (AddedIllegalLastTime) return IllegalInstrNumber; IRInstructionData *ID = nullptr; if (!End) ID = allocateIRInstructionData(*It, false, *IDL); else ID = allocateIRInstructionData(*IDL); InstrListForBB.push_back(ID); // Remember that we added an illegal number last time. AddedIllegalLastTime = true; unsigned INumber = IllegalInstrNumber; IntegerMappingForBB.push_back(IllegalInstrNumber--); assert(LegalInstrNumber < IllegalInstrNumber && "Instruction mapping overflow!"); assert(IllegalInstrNumber != DenseMapInfo::getEmptyKey() && "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); assert(IllegalInstrNumber != DenseMapInfo::getTombstoneKey() && "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); return INumber; } IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len, IRInstructionData *FirstInstIt, IRInstructionData *LastInstIt) : StartIdx(StartIdx), Len(Len) { assert(FirstInstIt != nullptr && "Instruction is nullptr!"); assert(LastInstIt != nullptr && "Instruction is nullptr!"); assert(StartIdx + Len > StartIdx && "Overflow for IRSimilarityCandidate range?"); assert(Len - 1 == static_cast(std::distance( iterator(FirstInstIt), iterator(LastInstIt))) && "Length of the first and last IRInstructionData do not match the " "given length"); // We iterate over the given instructions, and map each unique value // to a unique number in the IRSimilarityCandidate ValueToNumber and // NumberToValue maps. A constant get its own value globally, the individual // uses of the constants are not considered to be unique. // // IR: Mapping Added: // %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2 // %add2 = add i32 %a, %1 %add2 -> 4 // %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5 // // when replace with global values, starting from 1, would be // // 3 = add i32 1, 2 // 4 = add i32 1, 3 // 6 = add i32 5, 2 unsigned LocalValNumber = 1; IRInstructionDataList::iterator ID = iterator(*FirstInstIt); for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) { // Map the operand values to an unsigned integer if it does not already // have an unsigned integer assigned to it. for (Value *Arg : ID->OperVals) if (ValueToNumber.find(Arg) == ValueToNumber.end()) { ValueToNumber.try_emplace(Arg, LocalValNumber); NumberToValue.try_emplace(LocalValNumber, Arg); LocalValNumber++; } // Mapping the instructions to an unsigned integer if it is not already // exist in the mapping. if (ValueToNumber.find(ID->Inst) == ValueToNumber.end()) { ValueToNumber.try_emplace(ID->Inst, LocalValNumber); NumberToValue.try_emplace(LocalValNumber, ID->Inst); LocalValNumber++; } } // Setting the first and last instruction data pointers for the candidate. If // we got through the entire for loop without hitting an assert, we know // that both of these instructions are not nullptrs. FirstInst = FirstInstIt; LastInst = LastInstIt; // Add the basic blocks contained in the set into the global value numbering. DenseSet BBSet; getBasicBlocks(BBSet); for (BasicBlock *BB : BBSet) { if (ValueToNumber.find(BB) != ValueToNumber.end()) continue; ValueToNumber.try_emplace(BB, LocalValNumber); NumberToValue.try_emplace(LocalValNumber, BB); LocalValNumber++; } } bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A, const IRSimilarityCandidate &B) { if (A.getLength() != B.getLength()) return false; auto InstrDataForBoth = zip(make_range(A.begin(), A.end()), make_range(B.begin(), B.end())); return all_of(InstrDataForBoth, [](std::tuple R) { IRInstructionData &A = std::get<0>(R); IRInstructionData &B = std::get<1>(R); if (!A.Legal || !B.Legal) return false; return isClose(A, B); }); } /// Determine if one or more of the assigned global value numbers for the /// operands in \p TargetValueNumbers is in the current mapping set for operand /// numbers in \p SourceOperands. The set of possible corresponding global /// value numbers are replaced with the most recent version of compatible /// values. /// /// \param [in] SourceValueToNumberMapping - The mapping of a Value to global /// value number for the source IRInstructionCandidate. /// \param [in, out] CurrentSrcTgtNumberMapping - The current mapping of source /// IRSimilarityCandidate global value numbers to a set of possible numbers in /// the target. /// \param [in] SourceOperands - The operands in the original /// IRSimilarityCandidate in the current instruction. /// \param [in] TargetValueNumbers - The global value numbers of the operands in /// the corresponding Instruction in the other IRSimilarityCandidate. /// \returns true if there exists a possible mapping between the source /// Instruction operands and the target Instruction operands, and false if not. static bool checkNumberingAndReplaceCommutative( const DenseMap &SourceValueToNumberMapping, DenseMap> &CurrentSrcTgtNumberMapping, ArrayRef &SourceOperands, DenseSet &TargetValueNumbers){ DenseMap>::iterator ValueMappingIt; unsigned ArgVal; bool WasInserted; // Iterate over the operands in the source IRSimilarityCandidate to determine // whether there exists an operand in the other IRSimilarityCandidate that // creates a valid mapping of Value to Value between the // IRSimilarityCaniddates. for (Value *V : SourceOperands) { ArgVal = SourceValueToNumberMapping.find(V)->second; // Instead of finding a current mapping, we attempt to insert a set. std::tie(ValueMappingIt, WasInserted) = CurrentSrcTgtNumberMapping.insert( std::make_pair(ArgVal, TargetValueNumbers)); // We need to iterate over the items in other IRSimilarityCandidate's // Instruction to determine whether there is a valid mapping of // Value to Value. DenseSet NewSet; for (unsigned &Curr : ValueMappingIt->second) // If we can find the value in the mapping, we add it to the new set. if (TargetValueNumbers.contains(Curr)) NewSet.insert(Curr); // If we could not find a Value, return 0. if (NewSet.empty()) return false; // Otherwise replace the old mapping with the newly constructed one. if (NewSet.size() != ValueMappingIt->second.size()) ValueMappingIt->second.swap(NewSet); // We have reached no conclusions about the mapping, and cannot remove // any items from the other operands, so we move to check the next operand. if (ValueMappingIt->second.size() != 1) continue; unsigned ValToRemove = *ValueMappingIt->second.begin(); // When there is only one item left in the mapping for and operand, remove // the value from the other operands. If it results in there being no // mapping, return false, it means the mapping is wrong for (Value *InnerV : SourceOperands) { if (V == InnerV) continue; unsigned InnerVal = SourceValueToNumberMapping.find(InnerV)->second; ValueMappingIt = CurrentSrcTgtNumberMapping.find(InnerVal); if (ValueMappingIt == CurrentSrcTgtNumberMapping.end()) continue; ValueMappingIt->second.erase(ValToRemove); if (ValueMappingIt->second.empty()) return false; } } return true; } /// Determine if operand number \p TargetArgVal is in the current mapping set /// for operand number \p SourceArgVal. /// /// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global /// value numbers from source IRSimilarityCandidate to target /// IRSimilarityCandidate. /// \param [in] SourceArgVal The global value number for an operand in the /// in the original candidate. /// \param [in] TargetArgVal The global value number for the corresponding /// operand in the other candidate. /// \returns True if there exists a mapping and false if not. bool checkNumberingAndReplace( DenseMap> &CurrentSrcTgtNumberMapping, unsigned SourceArgVal, unsigned TargetArgVal) { // We are given two unsigned integers representing the global values of // the operands in different IRSimilarityCandidates and a current mapping // between the two. // // Source Operand GVN: 1 // Target Operand GVN: 2 // CurrentMapping: {1: {1, 2}} // // Since we have mapping, and the target operand is contained in the set, we // update it to: // CurrentMapping: {1: {2}} // and can return true. But, if the mapping was // CurrentMapping: {1: {3}} // we would return false. bool WasInserted; DenseMap>::iterator Val; std::tie(Val, WasInserted) = CurrentSrcTgtNumberMapping.insert( std::make_pair(SourceArgVal, DenseSet({TargetArgVal}))); // If we created a new mapping, then we are done. if (WasInserted) return true; // If there is more than one option in the mapping set, and the target value // is included in the mapping set replace that set with one that only includes // the target value, as it is the only valid mapping via the non commutative // instruction. DenseSet &TargetSet = Val->second; if (TargetSet.size() > 1 && TargetSet.contains(TargetArgVal)) { TargetSet.clear(); TargetSet.insert(TargetArgVal); return true; } // Return true if we can find the value in the set. return TargetSet.contains(TargetArgVal); } bool IRSimilarityCandidate::compareNonCommutativeOperandMapping( OperandMapping A, OperandMapping B) { // Iterators to keep track of where we are in the operands for each // Instruction. ArrayRef::iterator VItA = A.OperVals.begin(); ArrayRef::iterator VItB = B.OperVals.begin(); unsigned OperandLength = A.OperVals.size(); // For each operand, get the value numbering and ensure it is consistent. for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) { unsigned OperValA = A.IRSC.ValueToNumber.find(*VItA)->second; unsigned OperValB = B.IRSC.ValueToNumber.find(*VItB)->second; // Attempt to add a set with only the target value. If there is no mapping // we can create it here. // // For an instruction like a subtraction: // IRSimilarityCandidateA: IRSimilarityCandidateB: // %resultA = sub %a, %b %resultB = sub %d, %e // // We map %a -> %d and %b -> %e. // // And check to see whether their mapping is consistent in // checkNumberingAndReplace. if (!checkNumberingAndReplace(A.ValueNumberMapping, OperValA, OperValB)) return false; if (!checkNumberingAndReplace(B.ValueNumberMapping, OperValB, OperValA)) return false; } return true; } bool IRSimilarityCandidate::compareCommutativeOperandMapping( OperandMapping A, OperandMapping B) { DenseSet ValueNumbersA; DenseSet ValueNumbersB; ArrayRef::iterator VItA = A.OperVals.begin(); ArrayRef::iterator VItB = B.OperVals.begin(); unsigned OperandLength = A.OperVals.size(); // Find the value number sets for the operands. for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) { ValueNumbersA.insert(A.IRSC.ValueToNumber.find(*VItA)->second); ValueNumbersB.insert(B.IRSC.ValueToNumber.find(*VItB)->second); } // Iterate over the operands in the first IRSimilarityCandidate and make sure // there exists a possible mapping with the operands in the second // IRSimilarityCandidate. if (!checkNumberingAndReplaceCommutative(A.IRSC.ValueToNumber, A.ValueNumberMapping, A.OperVals, ValueNumbersB)) return false; // Iterate over the operands in the second IRSimilarityCandidate and make sure // there exists a possible mapping with the operands in the first // IRSimilarityCandidate. if (!checkNumberingAndReplaceCommutative(B.IRSC.ValueToNumber, B.ValueNumberMapping, B.OperVals, ValueNumbersA)) return false; return true; } bool IRSimilarityCandidate::checkRelativeLocations(RelativeLocMapping A, RelativeLocMapping B) { // Get the basic blocks the label refers to. BasicBlock *ABB = static_cast(A.OperVal); BasicBlock *BBB = static_cast(B.OperVal); // Get the basic blocks contained in each region. DenseSet BasicBlockA; DenseSet BasicBlockB; A.IRSC.getBasicBlocks(BasicBlockA); B.IRSC.getBasicBlocks(BasicBlockB); // Determine if the block is contained in the region. bool AContained = BasicBlockA.contains(ABB); bool BContained = BasicBlockB.contains(BBB); // Both blocks need to be contained in the region, or both need to be outside // the reigon. if (AContained != BContained) return false; // If both are contained, then we need to make sure that the relative // distance to the target blocks are the same. if (AContained) return A.RelativeLocation == B.RelativeLocation; return true; } bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A, const IRSimilarityCandidate &B) { DenseMap> MappingA; DenseMap> MappingB; return IRSimilarityCandidate::compareStructure(A, B, MappingA, MappingB); } typedef detail::zippy &, SmallVector &, ArrayRef &, ArrayRef &> ZippedRelativeLocationsT; bool IRSimilarityCandidate::compareStructure( const IRSimilarityCandidate &A, const IRSimilarityCandidate &B, DenseMap> &ValueNumberMappingA, DenseMap> &ValueNumberMappingB) { if (A.getLength() != B.getLength()) return false; if (A.ValueToNumber.size() != B.ValueToNumber.size()) return false; iterator ItA = A.begin(); iterator ItB = B.begin(); // These ValueNumber Mapping sets create a create a mapping between the values // in one candidate to values in the other candidate. If we create a set with // one element, and that same element maps to the original element in the // candidate we have a good mapping. DenseMap>::iterator ValueMappingIt; // Iterate over the instructions contained in each candidate unsigned SectionLength = A.getStartIdx() + A.getLength(); for (unsigned Loc = A.getStartIdx(); Loc < SectionLength; ItA++, ItB++, Loc++) { // Make sure the instructions are similar to one another. if (!isClose(*ItA, *ItB)) return false; Instruction *IA = ItA->Inst; Instruction *IB = ItB->Inst; if (!ItA->Legal || !ItB->Legal) return false; // Get the operand sets for the instructions. ArrayRef OperValsA = ItA->OperVals; ArrayRef OperValsB = ItB->OperVals; unsigned InstValA = A.ValueToNumber.find(IA)->second; unsigned InstValB = B.ValueToNumber.find(IB)->second; bool WasInserted; // Ensure that the mappings for the instructions exists. std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingA.insert( std::make_pair(InstValA, DenseSet({InstValB}))); if (!WasInserted && !ValueMappingIt->second.contains(InstValB)) return false; std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingB.insert( std::make_pair(InstValB, DenseSet({InstValA}))); if (!WasInserted && !ValueMappingIt->second.contains(InstValA)) return false; // We have different paths for commutative instructions and non-commutative // instructions since commutative instructions could allow multiple mappings // to certain values. if (IA->isCommutative() && !isa(IA) && !isa(IA)) { if (!compareCommutativeOperandMapping( {A, OperValsA, ValueNumberMappingA}, {B, OperValsB, ValueNumberMappingB})) return false; continue; } // Handle the non-commutative cases. if (!compareNonCommutativeOperandMapping( {A, OperValsA, ValueNumberMappingA}, {B, OperValsB, ValueNumberMappingB})) return false; // Here we check that between two corresponding instructions, // when referring to a basic block in the same region, the // relative locations are the same. And, that the instructions refer to // basic blocks outside the region in the same corresponding locations. // We are able to make the assumption about blocks outside of the region // since the target block labels are considered values and will follow the // same number matching that we defined for the other instructions in the // region. So, at this point, in each location we target a specific block // outside the region, we are targeting a corresponding block in each // analagous location in the region we are comparing to. if (!(isa(IA) && isa(IB)) && !(isa(IA) && isa(IB))) continue; SmallVector &RelBlockLocsA = ItA->RelativeBlockLocations; SmallVector &RelBlockLocsB = ItB->RelativeBlockLocations; if (RelBlockLocsA.size() != RelBlockLocsB.size() && OperValsA.size() != OperValsB.size()) return false; ZippedRelativeLocationsT ZippedRelativeLocations = zip(RelBlockLocsA, RelBlockLocsB, OperValsA, OperValsB); if (any_of(ZippedRelativeLocations, [&A, &B](std::tuple R) { return !checkRelativeLocations( {A, std::get<0>(R), std::get<2>(R)}, {B, std::get<1>(R), std::get<3>(R)}); })) return false; } return true; } bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A, const IRSimilarityCandidate &B) { auto DoesOverlap = [](const IRSimilarityCandidate &X, const IRSimilarityCandidate &Y) { // Check: // XXXXXX X starts before Y ends // YYYYYYY Y starts after X starts return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx; }; return DoesOverlap(A, B) || DoesOverlap(B, A); } void IRSimilarityIdentifier::populateMapper( Module &M, std::vector &InstrList, std::vector &IntegerMapping) { std::vector InstrListForModule; std::vector IntegerMappingForModule; // Iterate over the functions in the module to map each Instruction in each // BasicBlock to an unsigned integer. Mapper.initializeForBBs(M); for (Function &F : M) { if (F.empty()) continue; for (BasicBlock &BB : F) { // BB has potential to have similarity since it has a size greater than 2 // and can therefore match other regions greater than 2. Map it to a list // of unsigned integers. Mapper.convertToUnsignedVec(BB, InstrListForModule, IntegerMappingForModule); } BasicBlock::iterator It = F.begin()->end(); Mapper.mapToIllegalUnsigned(It, IntegerMappingForModule, InstrListForModule, true); if (InstrListForModule.size() > 0) Mapper.IDL->push_back(*InstrListForModule.back()); } // Insert the InstrListForModule at the end of the overall InstrList so that // we can have a long InstrList for the entire set of Modules being analyzed. llvm::append_range(InstrList, InstrListForModule); // Do the same as above, but for IntegerMapping. llvm::append_range(IntegerMapping, IntegerMappingForModule); } void IRSimilarityIdentifier::populateMapper( ArrayRef> &Modules, std::vector &InstrList, std::vector &IntegerMapping) { // Iterate over, and map the instructions in each module. for (const std::unique_ptr &M : Modules) populateMapper(*M, InstrList, IntegerMapping); } /// From a repeated subsequence, find all the different instances of the /// subsequence from the \p InstrList, and create an IRSimilarityCandidate from /// the IRInstructionData in subsequence. /// /// \param [in] Mapper - The instruction mapper for basic correctness checks. /// \param [in] InstrList - The vector that holds the instruction data. /// \param [in] IntegerMapping - The vector that holds the mapped integers. /// \param [out] CandsForRepSubstring - The vector to store the generated /// IRSimilarityCandidates. static void createCandidatesFromSuffixTree( const IRInstructionMapper& Mapper, std::vector &InstrList, std::vector &IntegerMapping, SuffixTree::RepeatedSubstring &RS, std::vector &CandsForRepSubstring) { unsigned StringLen = RS.Length; if (StringLen < 2) return; // Create an IRSimilarityCandidate for instance of this subsequence \p RS. for (const unsigned &StartIdx : RS.StartIndices) { unsigned EndIdx = StartIdx + StringLen - 1; // Check that this subsequence does not contain an illegal instruction. bool ContainsIllegal = false; for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) { unsigned Key = IntegerMapping[CurrIdx]; if (Key > Mapper.IllegalInstrNumber) { ContainsIllegal = true; break; } } // If we have an illegal instruction, we should not create an // IRSimilarityCandidate for this region. if (ContainsIllegal) continue; // We are getting iterators to the instructions in this region of code // by advancing the start and end indices from the start of the // InstrList. std::vector::iterator StartIt = InstrList.begin(); std::advance(StartIt, StartIdx); std::vector::iterator EndIt = InstrList.begin(); std::advance(EndIt, EndIdx); CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt); } } void IRSimilarityCandidate::createCanonicalRelationFrom( IRSimilarityCandidate &SourceCand, DenseMap> &ToSourceMapping, DenseMap> &FromSourceMapping) { assert(SourceCand.CanonNumToNumber.size() != 0 && "Base canonical relationship is empty!"); assert(SourceCand.NumberToCanonNum.size() != 0 && "Base canonical relationship is empty!"); assert(CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty"); assert(NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty"); DenseSet UsedGVNs; // Iterate over the mappings provided from this candidate to SourceCand. We // are then able to map the GVN in this candidate to the same canonical number // given to the corresponding GVN in SourceCand. for (std::pair> &GVNMapping : ToSourceMapping) { unsigned SourceGVN = GVNMapping.first; assert(GVNMapping.second.size() != 0 && "Possible GVNs is 0!"); unsigned ResultGVN; // We need special handling if we have more than one potential value. This // means that there are at least two GVNs that could correspond to this GVN. // This could lead to potential swapping later on, so we make a decision // here to ensure a one-to-one mapping. if (GVNMapping.second.size() > 1) { bool Found = false; for (unsigned Val : GVNMapping.second) { // We make sure the target value number hasn't already been reserved. if (UsedGVNs.contains(Val)) continue; // We make sure that the opposite mapping is still consistent. DenseMap>::iterator It = FromSourceMapping.find(Val); if (!It->second.contains(SourceGVN)) continue; // We pick the first item that satisfies these conditions. Found = true; ResultGVN = Val; break; } assert(Found && "Could not find matching value for source GVN"); (void)Found; } else ResultGVN = *GVNMapping.second.begin(); // Whatever GVN is found, we mark it as used. UsedGVNs.insert(ResultGVN); unsigned CanonNum = *SourceCand.getCanonicalNum(ResultGVN); CanonNumToNumber.insert(std::make_pair(CanonNum, SourceGVN)); NumberToCanonNum.insert(std::make_pair(SourceGVN, CanonNum)); } DenseSet BBSet; getBasicBlocks(BBSet); // Find canonical numbers for the BasicBlocks in the current candidate. // This is done by finding the corresponding value for the first instruction // in the block in the current candidate, finding the matching value in the // source candidate. Then by finding the parent of this value, use the // canonical number of the block in the source candidate for the canonical // number in the current candidate. for (BasicBlock *BB : BBSet) { unsigned BBGVNForCurrCand = ValueToNumber.find(BB)->second; // We can skip the BasicBlock if the canonical numbering has already been // found in a separate instruction. if (NumberToCanonNum.find(BBGVNForCurrCand) != NumberToCanonNum.end()) continue; // If the basic block is the starting block, then the shared instruction may // not be the first instruction in the block, it will be the first // instruction in the similarity region. Value *FirstOutlineInst = BB == getStartBB() ? frontInstruction() : &*BB->instructionsWithoutDebug().begin(); unsigned FirstInstGVN = *getGVN(FirstOutlineInst); unsigned FirstInstCanonNum = *getCanonicalNum(FirstInstGVN); unsigned SourceGVN = *SourceCand.fromCanonicalNum(FirstInstCanonNum); Value *SourceV = *SourceCand.fromGVN(SourceGVN); BasicBlock *SourceBB = cast(SourceV)->getParent(); unsigned SourceBBGVN = *SourceCand.getGVN(SourceBB); unsigned SourceCanonBBGVN = *SourceCand.getCanonicalNum(SourceBBGVN); CanonNumToNumber.insert(std::make_pair(SourceCanonBBGVN, BBGVNForCurrCand)); NumberToCanonNum.insert(std::make_pair(BBGVNForCurrCand, SourceCanonBBGVN)); } } void IRSimilarityCandidate::createCanonicalMappingFor( IRSimilarityCandidate &CurrCand) { assert(CurrCand.CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty"); assert(CurrCand.NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty"); unsigned CanonNum = 0; // Iterate over the value numbers found, the order does not matter in this // case. for (std::pair &NumToVal : CurrCand.NumberToValue) { CurrCand.NumberToCanonNum.insert(std::make_pair(NumToVal.first, CanonNum)); CurrCand.CanonNumToNumber.insert(std::make_pair(CanonNum, NumToVal.first)); CanonNum++; } } /// From the list of IRSimilarityCandidates, perform a comparison between each /// IRSimilarityCandidate to determine if there are overlapping /// IRInstructionData, or if they do not have the same structure. /// /// \param [in] CandsForRepSubstring - The vector containing the /// IRSimilarityCandidates. /// \param [out] StructuralGroups - the mapping of unsigned integers to vector /// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the /// vector are structurally similar to one another. static void findCandidateStructures( std::vector &CandsForRepSubstring, DenseMap &StructuralGroups) { std::vector::iterator CandIt, CandEndIt, InnerCandIt, InnerCandEndIt; // IRSimilarityCandidates each have a structure for operand use. It is // possible that two instances of the same subsequences have different // structure. Each type of structure found is assigned a number. This // DenseMap maps an IRSimilarityCandidate to which type of similarity // discovered it fits within. DenseMap CandToGroup; // Find the compatibility from each candidate to the others to determine // which candidates overlap and which have the same structure by mapping // each structure to a different group. bool SameStructure; bool Inserted; unsigned CurrentGroupNum = 0; unsigned OuterGroupNum; DenseMap::iterator CandToGroupIt; DenseMap::iterator CandToGroupItInner; DenseMap::iterator CurrentGroupPair; // Iterate over the candidates to determine its structural and overlapping // compatibility with other instructions DenseMap> ValueNumberMappingA; DenseMap> ValueNumberMappingB; for (CandIt = CandsForRepSubstring.begin(), CandEndIt = CandsForRepSubstring.end(); CandIt != CandEndIt; CandIt++) { // Determine if it has an assigned structural group already. CandToGroupIt = CandToGroup.find(&*CandIt); if (CandToGroupIt == CandToGroup.end()) { // If not, we assign it one, and add it to our mapping. std::tie(CandToGroupIt, Inserted) = CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++)); } // Get the structural group number from the iterator. OuterGroupNum = CandToGroupIt->second; // Check if we already have a list of IRSimilarityCandidates for the current // structural group. Create one if one does not exist. CurrentGroupPair = StructuralGroups.find(OuterGroupNum); if (CurrentGroupPair == StructuralGroups.end()) { IRSimilarityCandidate::createCanonicalMappingFor(*CandIt); std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert( std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt}))); } // Iterate over the IRSimilarityCandidates following the current // IRSimilarityCandidate in the list to determine whether the two // IRSimilarityCandidates are compatible. This is so we do not repeat pairs // of IRSimilarityCandidates. for (InnerCandIt = std::next(CandIt), InnerCandEndIt = CandsForRepSubstring.end(); InnerCandIt != InnerCandEndIt; InnerCandIt++) { // We check if the inner item has a group already, if it does, we skip it. CandToGroupItInner = CandToGroup.find(&*InnerCandIt); if (CandToGroupItInner != CandToGroup.end()) continue; // Otherwise we determine if they have the same structure and add it to // vector if they match. ValueNumberMappingA.clear(); ValueNumberMappingB.clear(); SameStructure = IRSimilarityCandidate::compareStructure( *CandIt, *InnerCandIt, ValueNumberMappingA, ValueNumberMappingB); if (!SameStructure) continue; InnerCandIt->createCanonicalRelationFrom(*CandIt, ValueNumberMappingA, ValueNumberMappingB); CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum)); CurrentGroupPair->second.push_back(*InnerCandIt); } } } void IRSimilarityIdentifier::findCandidates( std::vector &InstrList, std::vector &IntegerMapping) { SuffixTree ST(IntegerMapping); std::vector CandsForRepSubstring; std::vector NewCandidateGroups; DenseMap StructuralGroups; // Iterate over the subsequences found by the Suffix Tree to create // IRSimilarityCandidates for each repeated subsequence and determine which // instances are structurally similar to one another. for (SuffixTree::RepeatedSubstring &RS : ST) { createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, RS, CandsForRepSubstring); if (CandsForRepSubstring.size() < 2) continue; findCandidateStructures(CandsForRepSubstring, StructuralGroups); for (std::pair &Group : StructuralGroups) // We only add the group if it contains more than one // IRSimilarityCandidate. If there is only one, that means there is no // other repeated subsequence with the same structure. if (Group.second.size() > 1) SimilarityCandidates->push_back(Group.second); CandsForRepSubstring.clear(); StructuralGroups.clear(); NewCandidateGroups.clear(); } } SimilarityGroupList &IRSimilarityIdentifier::findSimilarity( ArrayRef> Modules) { resetSimilarityCandidates(); std::vector InstrList; std::vector IntegerMapping; Mapper.InstClassifier.EnableBranches = this->EnableBranches; Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls; Mapper.EnableMatchCallsByName = EnableMatchingCallsByName; Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics; Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls; populateMapper(Modules, InstrList, IntegerMapping); findCandidates(InstrList, IntegerMapping); return *SimilarityCandidates; } SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) { resetSimilarityCandidates(); Mapper.InstClassifier.EnableBranches = this->EnableBranches; Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls; Mapper.EnableMatchCallsByName = EnableMatchingCallsByName; Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics; Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls; std::vector InstrList; std::vector IntegerMapping; populateMapper(M, InstrList, IntegerMapping); findCandidates(InstrList, IntegerMapping); return *SimilarityCandidates; } INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier", "ir-similarity-identifier", false, true) IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass() : ModulePass(ID) { initializeIRSimilarityIdentifierWrapperPassPass( *PassRegistry::getPassRegistry()); } bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) { IRSI.reset(new IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls, MatchCallsByName, !DisableIntrinsics, false)); return false; } bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) { IRSI.reset(); return false; } bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) { IRSI->findSimilarity(M); return false; } AnalysisKey IRSimilarityAnalysis::Key; IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M, ModuleAnalysisManager &) { auto IRSI = IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls, MatchCallsByName, !DisableIntrinsics, false); IRSI.findSimilarity(M); return IRSI; } PreservedAnalyses IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) { IRSimilarityIdentifier &IRSI = AM.getResult(M); Optional &SimilarityCandidatesOpt = IRSI.getSimilarity(); for (std::vector &CandVec : *SimilarityCandidatesOpt) { OS << CandVec.size() << " candidates of length " << CandVec.begin()->getLength() << ". Found in: \n"; for (IRSimilarityCandidate &Cand : CandVec) { OS << " Function: " << Cand.front()->Inst->getFunction()->getName().str() << ", Basic Block: "; if (Cand.front()->Inst->getParent()->getName().str() == "") OS << "(unnamed)"; else OS << Cand.front()->Inst->getParent()->getName().str(); OS << "\n Start Instruction: "; Cand.frontInstruction()->print(OS); OS << "\n End Instruction: "; Cand.backInstruction()->print(OS); OS << "\n"; } } return PreservedAnalyses::all(); } char IRSimilarityIdentifierWrapperPass::ID = 0;