xref: /freebsd/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp (revision 51015e6d0f570239b0c2088dc6cf2b018928375d)
1 //===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Implementation file for the IRSimilarityIdentifier for identifying
11 // similarities in IR including the IRInstructionMapper.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Analysis/IRSimilarityIdentifier.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/IR/Intrinsics.h"
18 #include "llvm/IR/Operator.h"
19 #include "llvm/IR/User.h"
20 #include "llvm/InitializePasses.h"
21 #include "llvm/Support/SuffixTree.h"
22 
23 using namespace llvm;
24 using namespace IRSimilarity;
25 
26 namespace llvm {
27 cl::opt<bool>
28     DisableBranches("no-ir-sim-branch-matching", cl::init(false),
29                     cl::ReallyHidden,
30                     cl::desc("disable similarity matching, and outlining, "
31                              "across branches for debugging purposes."));
32 
33 cl::opt<bool>
34     DisableIndirectCalls("no-ir-sim-indirect-calls", cl::init(false),
35                          cl::ReallyHidden,
36                          cl::desc("disable outlining indirect calls."));
37 
38 cl::opt<bool>
39     MatchCallsByName("ir-sim-calls-by-name", cl::init(false), cl::ReallyHidden,
40                      cl::desc("only allow matching call instructions if the "
41                               "name and type signature match."));
42 
43 cl::opt<bool>
44     DisableIntrinsics("no-ir-sim-intrinsics", cl::init(false), cl::ReallyHidden,
45                       cl::desc("Don't match or outline intrinsics"));
46 } // namespace llvm
47 
48 IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
49                                      IRInstructionDataList &IDList)
50     : Inst(&I), Legal(Legality), IDL(&IDList) {
51   initializeInstruction();
52 }
53 
54 void IRInstructionData::initializeInstruction() {
55   // We check for whether we have a comparison instruction.  If it is, we
56   // find the "less than" version of the predicate for consistency for
57   // comparison instructions throught the program.
58   if (CmpInst *C = dyn_cast<CmpInst>(Inst)) {
59     CmpInst::Predicate Predicate = predicateForConsistency(C);
60     if (Predicate != C->getPredicate())
61       RevisedPredicate = Predicate;
62   }
63 
64   // Here we collect the operands and their types for determining whether
65   // the structure of the operand use matches between two different candidates.
66   for (Use &OI : Inst->operands()) {
67     if (isa<CmpInst>(Inst) && RevisedPredicate) {
68       // If we have a CmpInst where the predicate is reversed, it means the
69       // operands must be reversed as well.
70       OperVals.insert(OperVals.begin(), OI.get());
71       continue;
72     }
73 
74     OperVals.push_back(OI.get());
75   }
76 
77   // We capture the incoming BasicBlocks as values as well as the incoming
78   // Values in order to check for structural similarity.
79   if (PHINode *PN = dyn_cast<PHINode>(Inst))
80     for (BasicBlock *BB : PN->blocks())
81       OperVals.push_back(BB);
82 }
83 
84 IRInstructionData::IRInstructionData(IRInstructionDataList &IDList)
85     : IDL(&IDList) {}
86 
87 void IRInstructionData::setBranchSuccessors(
88     DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
89   assert(isa<BranchInst>(Inst) && "Instruction must be branch");
90 
91   BranchInst *BI = cast<BranchInst>(Inst);
92   DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
93 
94   BBNumIt = BasicBlockToInteger.find(BI->getParent());
95   assert(BBNumIt != BasicBlockToInteger.end() &&
96          "Could not find location for BasicBlock!");
97 
98   int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
99 
100   for (BasicBlock *Successor : BI->successors()) {
101     BBNumIt = BasicBlockToInteger.find(Successor);
102     assert(BBNumIt != BasicBlockToInteger.end() &&
103            "Could not find number for BasicBlock!");
104     int OtherBlockNumber = static_cast<int>(BBNumIt->second);
105 
106     int Relative = OtherBlockNumber - CurrentBlockNumber;
107     RelativeBlockLocations.push_back(Relative);
108   }
109 }
110 
111 void IRInstructionData::setCalleeName(bool MatchByName) {
112   CallInst *CI = dyn_cast<CallInst>(Inst);
113   assert(CI && "Instruction must be call");
114 
115   CalleeName = "";
116   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
117     // To hash intrinsics, we use the opcode, and types like the other
118     // instructions, but also, the Intrinsic ID, and the Name of the
119     // intrinsic.
120     Intrinsic::ID IntrinsicID = II->getIntrinsicID();
121     FunctionType *FT = II->getFunctionType();
122     // If there is an overloaded name, we have to use the complex version
123     // of getName to get the entire string.
124     if (Intrinsic::isOverloaded(IntrinsicID))
125       CalleeName =
126           Intrinsic::getName(IntrinsicID, FT->params(), II->getModule(), FT);
127     // If there is not an overloaded name, we only need to use this version.
128     else
129       CalleeName = Intrinsic::getName(IntrinsicID).str();
130 
131     return;
132   }
133 
134   if (!CI->isIndirectCall() && MatchByName)
135     CalleeName = CI->getCalledFunction()->getName().str();
136 }
137 
138 void IRInstructionData::setPHIPredecessors(
139     DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
140   assert(isa<PHINode>(Inst) && "Instruction must be phi node");
141 
142   PHINode *PN = cast<PHINode>(Inst);
143   DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
144 
145   BBNumIt = BasicBlockToInteger.find(PN->getParent());
146   assert(BBNumIt != BasicBlockToInteger.end() &&
147          "Could not find location for BasicBlock!");
148 
149   int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
150 
151   // Convert the incoming blocks of the PHINode to an integer value, based on
152   // the relative distances between the current block and the incoming block.
153   for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) {
154     BasicBlock *Incoming = PN->getIncomingBlock(Idx);
155     BBNumIt = BasicBlockToInteger.find(Incoming);
156     assert(BBNumIt != BasicBlockToInteger.end() &&
157            "Could not find number for BasicBlock!");
158     int OtherBlockNumber = static_cast<int>(BBNumIt->second);
159 
160     int Relative = OtherBlockNumber - CurrentBlockNumber;
161     RelativeBlockLocations.push_back(Relative);
162     RelativeBlockLocations.push_back(Relative);
163   }
164 }
165 
166 CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
167   switch (CI->getPredicate()) {
168   case CmpInst::FCMP_OGT:
169   case CmpInst::FCMP_UGT:
170   case CmpInst::FCMP_OGE:
171   case CmpInst::FCMP_UGE:
172   case CmpInst::ICMP_SGT:
173   case CmpInst::ICMP_UGT:
174   case CmpInst::ICMP_SGE:
175   case CmpInst::ICMP_UGE:
176     return CI->getSwappedPredicate();
177   default:
178     return CI->getPredicate();
179   }
180 }
181 
182 CmpInst::Predicate IRInstructionData::getPredicate() const {
183   assert(isa<CmpInst>(Inst) &&
184          "Can only get a predicate from a compare instruction");
185 
186   if (RevisedPredicate)
187     return RevisedPredicate.value();
188 
189   return cast<CmpInst>(Inst)->getPredicate();
190 }
191 
192 StringRef IRInstructionData::getCalleeName() const {
193   assert(isa<CallInst>(Inst) &&
194          "Can only get a name from a call instruction");
195 
196   assert(CalleeName && "CalleeName has not been set");
197 
198   return *CalleeName;
199 }
200 
201 bool IRSimilarity::isClose(const IRInstructionData &A,
202                            const IRInstructionData &B) {
203 
204   if (!A.Legal || !B.Legal)
205     return false;
206 
207   // Check if we are performing the same sort of operation on the same types
208   // but not on the same values.
209   if (!A.Inst->isSameOperationAs(B.Inst)) {
210     // If there is a predicate, this means that either there is a swapped
211     // predicate, or that the types are different, we want to make sure that
212     // the predicates are equivalent via swapping.
213     if (isa<CmpInst>(A.Inst) && isa<CmpInst>(B.Inst)) {
214 
215       if (A.getPredicate() != B.getPredicate())
216         return false;
217 
218       // If the predicates are the same via swap, make sure that the types are
219       // still the same.
220       auto ZippedTypes = zip(A.OperVals, B.OperVals);
221 
222       return all_of(
223           ZippedTypes, [](std::tuple<llvm::Value *, llvm::Value *> R) {
224             return std::get<0>(R)->getType() == std::get<1>(R)->getType();
225           });
226     }
227 
228     return false;
229   }
230 
231   // Since any GEP Instruction operands after the first operand cannot be
232   // defined by a register, we must make sure that the operands after the first
233   // are the same in the two instructions
234   if (auto *GEP = dyn_cast<GetElementPtrInst>(A.Inst)) {
235     auto *OtherGEP = cast<GetElementPtrInst>(B.Inst);
236 
237     // If the instructions do not have the same inbounds restrictions, we do
238     // not consider them the same.
239     if (GEP->isInBounds() != OtherGEP->isInBounds())
240       return false;
241 
242     auto ZippedOperands = zip(GEP->indices(), OtherGEP->indices());
243 
244     // We increment here since we do not care about the first instruction,
245     // we only care about the following operands since they must be the
246     // exact same to be considered similar.
247     return all_of(drop_begin(ZippedOperands),
248                   [](std::tuple<llvm::Use &, llvm::Use &> R) {
249                     return std::get<0>(R) == std::get<1>(R);
250                   });
251   }
252 
253   // If the instructions are functions calls, we make sure that the function
254   // name is the same.  We already know that the types are since is
255   // isSameOperationAs is true.
256   if (isa<CallInst>(A.Inst) && isa<CallInst>(B.Inst)) {
257     if (A.getCalleeName().str() != B.getCalleeName().str())
258       return false;
259   }
260 
261   if (isa<BranchInst>(A.Inst) && isa<BranchInst>(B.Inst) &&
262       A.RelativeBlockLocations.size() != B.RelativeBlockLocations.size())
263     return false;
264 
265   return true;
266 }
267 
268 // TODO: This is the same as the MachineOutliner, and should be consolidated
269 // into the same interface.
270 void IRInstructionMapper::convertToUnsignedVec(
271     BasicBlock &BB, std::vector<IRInstructionData *> &InstrList,
272     std::vector<unsigned> &IntegerMapping) {
273   BasicBlock::iterator It = BB.begin();
274 
275   std::vector<unsigned> IntegerMappingForBB;
276   std::vector<IRInstructionData *> InstrListForBB;
277 
278   for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
279     switch (InstClassifier.visit(*It)) {
280     case InstrType::Legal:
281       mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
282       break;
283     case InstrType::Illegal:
284       mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
285       break;
286     case InstrType::Invisible:
287       AddedIllegalLastTime = false;
288       break;
289     }
290   }
291 
292   if (AddedIllegalLastTime)
293     mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true);
294   for (IRInstructionData *ID : InstrListForBB)
295     this->IDL->push_back(*ID);
296   llvm::append_range(InstrList, InstrListForBB);
297   llvm::append_range(IntegerMapping, IntegerMappingForBB);
298 }
299 
300 // TODO: This is the same as the MachineOutliner, and should be consolidated
301 // into the same interface.
302 unsigned IRInstructionMapper::mapToLegalUnsigned(
303     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
304     std::vector<IRInstructionData *> &InstrListForBB) {
305   // We added something legal, so we should unset the AddedLegalLastTime
306   // flag.
307   AddedIllegalLastTime = false;
308 
309   // If we have at least two adjacent legal instructions (which may have
310   // invisible instructions in between), remember that.
311   if (CanCombineWithPrevInstr)
312     HaveLegalRange = true;
313   CanCombineWithPrevInstr = true;
314 
315   // Get the integer for this instruction or give it the current
316   // LegalInstrNumber.
317   IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL);
318   InstrListForBB.push_back(ID);
319 
320   if (isa<BranchInst>(*It))
321     ID->setBranchSuccessors(BasicBlockToInteger);
322 
323   if (isa<CallInst>(*It))
324     ID->setCalleeName(EnableMatchCallsByName);
325 
326   if (isa<PHINode>(*It))
327     ID->setPHIPredecessors(BasicBlockToInteger);
328 
329   // Add to the instruction list
330   bool WasInserted;
331   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
332       ResultIt;
333   std::tie(ResultIt, WasInserted) =
334       InstructionIntegerMap.insert(std::make_pair(ID, LegalInstrNumber));
335   unsigned INumber = ResultIt->second;
336 
337   // There was an insertion.
338   if (WasInserted)
339     LegalInstrNumber++;
340 
341   IntegerMappingForBB.push_back(INumber);
342 
343   // Make sure we don't overflow or use any integers reserved by the DenseMap.
344   assert(LegalInstrNumber < IllegalInstrNumber &&
345          "Instruction mapping overflow!");
346 
347   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
348          "Tried to assign DenseMap tombstone or empty key to instruction.");
349   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
350          "Tried to assign DenseMap tombstone or empty key to instruction.");
351 
352   return INumber;
353 }
354 
355 IRInstructionData *
356 IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
357                                                IRInstructionDataList &IDL) {
358   return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
359 }
360 
361 IRInstructionData *
362 IRInstructionMapper::allocateIRInstructionData(IRInstructionDataList &IDL) {
363   return new (InstDataAllocator->Allocate()) IRInstructionData(IDL);
364 }
365 
366 IRInstructionDataList *
367 IRInstructionMapper::allocateIRInstructionDataList() {
368   return new (IDLAllocator->Allocate()) IRInstructionDataList();
369 }
370 
371 // TODO: This is the same as the MachineOutliner, and should be consolidated
372 // into the same interface.
373 unsigned IRInstructionMapper::mapToIllegalUnsigned(
374     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
375     std::vector<IRInstructionData *> &InstrListForBB, bool End) {
376   // Can't combine an illegal instruction. Set the flag.
377   CanCombineWithPrevInstr = false;
378 
379   // Only add one illegal number per range of legal numbers.
380   if (AddedIllegalLastTime)
381     return IllegalInstrNumber;
382 
383   IRInstructionData *ID = nullptr;
384   if (!End)
385     ID = allocateIRInstructionData(*It, false, *IDL);
386   else
387     ID = allocateIRInstructionData(*IDL);
388   InstrListForBB.push_back(ID);
389 
390   // Remember that we added an illegal number last time.
391   AddedIllegalLastTime = true;
392   unsigned INumber = IllegalInstrNumber;
393   IntegerMappingForBB.push_back(IllegalInstrNumber--);
394 
395   assert(LegalInstrNumber < IllegalInstrNumber &&
396          "Instruction mapping overflow!");
397 
398   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
399          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
400 
401   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
402          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
403 
404   return INumber;
405 }
406 
407 IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
408                                              IRInstructionData *FirstInstIt,
409                                              IRInstructionData *LastInstIt)
410     : StartIdx(StartIdx), Len(Len) {
411 
412   assert(FirstInstIt != nullptr && "Instruction is nullptr!");
413   assert(LastInstIt != nullptr && "Instruction is nullptr!");
414   assert(StartIdx + Len > StartIdx &&
415          "Overflow for IRSimilarityCandidate range?");
416   assert(Len - 1 == static_cast<unsigned>(std::distance(
417                         iterator(FirstInstIt), iterator(LastInstIt))) &&
418          "Length of the first and last IRInstructionData do not match the "
419          "given length");
420 
421   // We iterate over the given instructions, and map each unique value
422   // to a unique number in the IRSimilarityCandidate ValueToNumber and
423   // NumberToValue maps.  A constant get its own value globally, the individual
424   // uses of the constants are not considered to be unique.
425   //
426   // IR:                    Mapping Added:
427   // %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
428   // %add2 = add i32 %a, %1    %add2 -> 4
429   // %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
430   //
431   // when replace with global values, starting from 1, would be
432   //
433   // 3 = add i32 1, 2
434   // 4 = add i32 1, 3
435   // 6 = add i32 5, 2
436   unsigned LocalValNumber = 1;
437   IRInstructionDataList::iterator ID = iterator(*FirstInstIt);
438   for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) {
439     // Map the operand values to an unsigned integer if it does not already
440     // have an unsigned integer assigned to it.
441     for (Value *Arg : ID->OperVals)
442       if (ValueToNumber.find(Arg) == ValueToNumber.end()) {
443         ValueToNumber.try_emplace(Arg, LocalValNumber);
444         NumberToValue.try_emplace(LocalValNumber, Arg);
445         LocalValNumber++;
446       }
447 
448     // Mapping the instructions to an unsigned integer if it is not already
449     // exist in the mapping.
450     if (ValueToNumber.find(ID->Inst) == ValueToNumber.end()) {
451       ValueToNumber.try_emplace(ID->Inst, LocalValNumber);
452       NumberToValue.try_emplace(LocalValNumber, ID->Inst);
453       LocalValNumber++;
454     }
455   }
456 
457   // Setting the first and last instruction data pointers for the candidate.  If
458   // we got through the entire for loop without hitting an assert, we know
459   // that both of these instructions are not nullptrs.
460   FirstInst = FirstInstIt;
461   LastInst = LastInstIt;
462 
463   // Add the basic blocks contained in the set into the global value numbering.
464   DenseSet<BasicBlock *> BBSet;
465   getBasicBlocks(BBSet);
466   for (BasicBlock *BB : BBSet) {
467     if (ValueToNumber.find(BB) != ValueToNumber.end())
468       continue;
469 
470     ValueToNumber.try_emplace(BB, LocalValNumber);
471     NumberToValue.try_emplace(LocalValNumber, BB);
472     LocalValNumber++;
473   }
474 }
475 
476 bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A,
477                                       const IRSimilarityCandidate &B) {
478   if (A.getLength() != B.getLength())
479     return false;
480 
481   auto InstrDataForBoth =
482       zip(make_range(A.begin(), A.end()), make_range(B.begin(), B.end()));
483 
484   return all_of(InstrDataForBoth,
485                 [](std::tuple<IRInstructionData &, IRInstructionData &> R) {
486                   IRInstructionData &A = std::get<0>(R);
487                   IRInstructionData &B = std::get<1>(R);
488                   if (!A.Legal || !B.Legal)
489                     return false;
490                   return isClose(A, B);
491                 });
492 }
493 
494 /// Determine if one or more of the assigned global value numbers for the
495 /// operands in \p TargetValueNumbers is in the current mapping set for operand
496 /// numbers in \p SourceOperands.  The set of possible corresponding global
497 /// value numbers are replaced with the most recent version of compatible
498 /// values.
499 ///
500 /// \param [in] SourceValueToNumberMapping - The mapping of a Value to global
501 /// value number for the source IRInstructionCandidate.
502 /// \param [in, out] CurrentSrcTgtNumberMapping - The current mapping of source
503 /// IRSimilarityCandidate global value numbers to a set of possible numbers in
504 /// the target.
505 /// \param [in] SourceOperands - The operands in the original
506 /// IRSimilarityCandidate in the current instruction.
507 /// \param [in] TargetValueNumbers - The global value numbers of the operands in
508 /// the corresponding Instruction in the other IRSimilarityCandidate.
509 /// \returns true if there exists a possible mapping between the source
510 /// Instruction operands and the target Instruction operands, and false if not.
511 static bool checkNumberingAndReplaceCommutative(
512   const DenseMap<Value *, unsigned> &SourceValueToNumberMapping,
513   DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
514   ArrayRef<Value *> &SourceOperands,
515   DenseSet<unsigned> &TargetValueNumbers){
516 
517   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
518 
519   unsigned ArgVal;
520   bool WasInserted;
521 
522   // Iterate over the operands in the source IRSimilarityCandidate to determine
523   // whether there exists an operand in the other IRSimilarityCandidate that
524   // creates a valid mapping of Value to Value between the
525   // IRSimilarityCaniddates.
526   for (Value *V : SourceOperands) {
527     ArgVal = SourceValueToNumberMapping.find(V)->second;
528 
529     // Instead of finding a current mapping, we attempt to insert a set.
530     std::tie(ValueMappingIt, WasInserted) = CurrentSrcTgtNumberMapping.insert(
531         std::make_pair(ArgVal, TargetValueNumbers));
532 
533     // We need to iterate over the items in other IRSimilarityCandidate's
534     // Instruction to determine whether there is a valid mapping of
535     // Value to Value.
536     DenseSet<unsigned> NewSet;
537     for (unsigned &Curr : ValueMappingIt->second)
538       // If we can find the value in the mapping, we add it to the new set.
539       if (TargetValueNumbers.contains(Curr))
540         NewSet.insert(Curr);
541 
542     // If we could not find a Value, return 0.
543     if (NewSet.empty())
544       return false;
545 
546     // Otherwise replace the old mapping with the newly constructed one.
547     if (NewSet.size() != ValueMappingIt->second.size())
548       ValueMappingIt->second.swap(NewSet);
549 
550     // We have reached no conclusions about the mapping, and cannot remove
551     // any items from the other operands, so we move to check the next operand.
552     if (ValueMappingIt->second.size() != 1)
553       continue;
554 
555     unsigned ValToRemove = *ValueMappingIt->second.begin();
556     // When there is only one item left in the mapping for and operand, remove
557     // the value from the other operands.  If it results in there being no
558     // mapping, return false, it means the mapping is wrong
559     for (Value *InnerV : SourceOperands) {
560       if (V == InnerV)
561         continue;
562 
563       unsigned InnerVal = SourceValueToNumberMapping.find(InnerV)->second;
564       ValueMappingIt = CurrentSrcTgtNumberMapping.find(InnerVal);
565       if (ValueMappingIt == CurrentSrcTgtNumberMapping.end())
566         continue;
567 
568       ValueMappingIt->second.erase(ValToRemove);
569       if (ValueMappingIt->second.empty())
570         return false;
571     }
572   }
573 
574   return true;
575 }
576 
577 /// Determine if operand number \p TargetArgVal is in the current mapping set
578 /// for operand number \p SourceArgVal.
579 ///
580 /// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global
581 /// value numbers from source IRSimilarityCandidate to target
582 /// IRSimilarityCandidate.
583 /// \param [in] SourceArgVal The global value number for an operand in the
584 /// in the original candidate.
585 /// \param [in] TargetArgVal The global value number for the corresponding
586 /// operand in the other candidate.
587 /// \returns True if there exists a mapping and false if not.
588 bool checkNumberingAndReplace(
589     DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
590     unsigned SourceArgVal, unsigned TargetArgVal) {
591   // We are given two unsigned integers representing the global values of
592   // the operands in different IRSimilarityCandidates and a current mapping
593   // between the two.
594   //
595   // Source Operand GVN: 1
596   // Target Operand GVN: 2
597   // CurrentMapping: {1: {1, 2}}
598   //
599   // Since we have mapping, and the target operand is contained in the set, we
600   // update it to:
601   // CurrentMapping: {1: {2}}
602   // and can return true. But, if the mapping was
603   // CurrentMapping: {1: {3}}
604   // we would return false.
605 
606   bool WasInserted;
607   DenseMap<unsigned, DenseSet<unsigned>>::iterator Val;
608 
609   std::tie(Val, WasInserted) = CurrentSrcTgtNumberMapping.insert(
610       std::make_pair(SourceArgVal, DenseSet<unsigned>({TargetArgVal})));
611 
612   // If we created a new mapping, then we are done.
613   if (WasInserted)
614     return true;
615 
616   // If there is more than one option in the mapping set, and the target value
617   // is included in the mapping set replace that set with one that only includes
618   // the target value, as it is the only valid mapping via the non commutative
619   // instruction.
620 
621   DenseSet<unsigned> &TargetSet = Val->second;
622   if (TargetSet.size() > 1 && TargetSet.contains(TargetArgVal)) {
623     TargetSet.clear();
624     TargetSet.insert(TargetArgVal);
625     return true;
626   }
627 
628   // Return true if we can find the value in the set.
629   return TargetSet.contains(TargetArgVal);
630 }
631 
632 bool IRSimilarityCandidate::compareNonCommutativeOperandMapping(
633     OperandMapping A, OperandMapping B) {
634   // Iterators to keep track of where we are in the operands for each
635   // Instruction.
636   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
637   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
638   unsigned OperandLength = A.OperVals.size();
639 
640   // For each operand, get the value numbering and ensure it is consistent.
641   for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) {
642     unsigned OperValA = A.IRSC.ValueToNumber.find(*VItA)->second;
643     unsigned OperValB = B.IRSC.ValueToNumber.find(*VItB)->second;
644 
645     // Attempt to add a set with only the target value.  If there is no mapping
646     // we can create it here.
647     //
648     // For an instruction like a subtraction:
649     // IRSimilarityCandidateA:  IRSimilarityCandidateB:
650     // %resultA = sub %a, %b    %resultB = sub %d, %e
651     //
652     // We map %a -> %d and %b -> %e.
653     //
654     // And check to see whether their mapping is consistent in
655     // checkNumberingAndReplace.
656 
657     if (!checkNumberingAndReplace(A.ValueNumberMapping, OperValA, OperValB))
658       return false;
659 
660     if (!checkNumberingAndReplace(B.ValueNumberMapping, OperValB, OperValA))
661       return false;
662   }
663   return true;
664 }
665 
666 bool IRSimilarityCandidate::compareCommutativeOperandMapping(
667     OperandMapping A, OperandMapping B) {
668   DenseSet<unsigned> ValueNumbersA;
669   DenseSet<unsigned> ValueNumbersB;
670 
671   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
672   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
673   unsigned OperandLength = A.OperVals.size();
674 
675   // Find the value number sets for the operands.
676   for (unsigned Idx = 0; Idx < OperandLength;
677        Idx++, VItA++, VItB++) {
678     ValueNumbersA.insert(A.IRSC.ValueToNumber.find(*VItA)->second);
679     ValueNumbersB.insert(B.IRSC.ValueToNumber.find(*VItB)->second);
680   }
681 
682   // Iterate over the operands in the first IRSimilarityCandidate and make sure
683   // there exists a possible mapping with the operands in the second
684   // IRSimilarityCandidate.
685   if (!checkNumberingAndReplaceCommutative(A.IRSC.ValueToNumber,
686                                            A.ValueNumberMapping, A.OperVals,
687                                            ValueNumbersB))
688     return false;
689 
690   // Iterate over the operands in the second IRSimilarityCandidate and make sure
691   // there exists a possible mapping with the operands in the first
692   // IRSimilarityCandidate.
693   if (!checkNumberingAndReplaceCommutative(B.IRSC.ValueToNumber,
694                                            B.ValueNumberMapping, B.OperVals,
695                                            ValueNumbersA))
696     return false;
697 
698   return true;
699 }
700 
701 bool IRSimilarityCandidate::checkRelativeLocations(RelativeLocMapping A,
702                                                    RelativeLocMapping B) {
703   // Get the basic blocks the label refers to.
704   BasicBlock *ABB = static_cast<BasicBlock *>(A.OperVal);
705   BasicBlock *BBB = static_cast<BasicBlock *>(B.OperVal);
706 
707   // Get the basic blocks contained in each region.
708   DenseSet<BasicBlock *> BasicBlockA;
709   DenseSet<BasicBlock *> BasicBlockB;
710   A.IRSC.getBasicBlocks(BasicBlockA);
711   B.IRSC.getBasicBlocks(BasicBlockB);
712 
713   // Determine if the block is contained in the region.
714   bool AContained = BasicBlockA.contains(ABB);
715   bool BContained = BasicBlockB.contains(BBB);
716 
717   // Both blocks need to be contained in the region, or both need to be outside
718   // the reigon.
719   if (AContained != BContained)
720     return false;
721 
722   // If both are contained, then we need to make sure that the relative
723   // distance to the target blocks are the same.
724   if (AContained)
725     return A.RelativeLocation == B.RelativeLocation;
726   return true;
727 }
728 
729 bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
730                                              const IRSimilarityCandidate &B) {
731   DenseMap<unsigned, DenseSet<unsigned>> MappingA;
732   DenseMap<unsigned, DenseSet<unsigned>> MappingB;
733   return IRSimilarityCandidate::compareStructure(A, B, MappingA, MappingB);
734 }
735 
736 typedef detail::zippy<detail::zip_shortest, SmallVector<int, 4> &,
737                       SmallVector<int, 4> &, ArrayRef<Value *> &,
738                       ArrayRef<Value *> &>
739     ZippedRelativeLocationsT;
740 
741 bool IRSimilarityCandidate::compareStructure(
742     const IRSimilarityCandidate &A, const IRSimilarityCandidate &B,
743     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
744     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
745   if (A.getLength() != B.getLength())
746     return false;
747 
748   if (A.ValueToNumber.size() != B.ValueToNumber.size())
749     return false;
750 
751   iterator ItA = A.begin();
752   iterator ItB = B.begin();
753 
754   // These ValueNumber Mapping sets create a create a mapping between the values
755   // in one candidate to values in the other candidate.  If we create a set with
756   // one element, and that same element maps to the original element in the
757   // candidate we have a good mapping.
758   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
759 
760 
761   // Iterate over the instructions contained in each candidate
762   unsigned SectionLength = A.getStartIdx() + A.getLength();
763   for (unsigned Loc = A.getStartIdx(); Loc < SectionLength;
764        ItA++, ItB++, Loc++) {
765     // Make sure the instructions are similar to one another.
766     if (!isClose(*ItA, *ItB))
767       return false;
768 
769     Instruction *IA = ItA->Inst;
770     Instruction *IB = ItB->Inst;
771 
772     if (!ItA->Legal || !ItB->Legal)
773       return false;
774 
775     // Get the operand sets for the instructions.
776     ArrayRef<Value *> OperValsA = ItA->OperVals;
777     ArrayRef<Value *> OperValsB = ItB->OperVals;
778 
779     unsigned InstValA = A.ValueToNumber.find(IA)->second;
780     unsigned InstValB = B.ValueToNumber.find(IB)->second;
781 
782     bool WasInserted;
783     // Ensure that the mappings for the instructions exists.
784     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingA.insert(
785         std::make_pair(InstValA, DenseSet<unsigned>({InstValB})));
786     if (!WasInserted && !ValueMappingIt->second.contains(InstValB))
787       return false;
788 
789     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingB.insert(
790         std::make_pair(InstValB, DenseSet<unsigned>({InstValA})));
791     if (!WasInserted && !ValueMappingIt->second.contains(InstValA))
792       return false;
793 
794     // We have different paths for commutative instructions and non-commutative
795     // instructions since commutative instructions could allow multiple mappings
796     // to certain values.
797     if (IA->isCommutative() && !isa<FPMathOperator>(IA) &&
798         !isa<IntrinsicInst>(IA)) {
799       if (!compareCommutativeOperandMapping(
800               {A, OperValsA, ValueNumberMappingA},
801               {B, OperValsB, ValueNumberMappingB}))
802         return false;
803       continue;
804     }
805 
806     // Handle the non-commutative cases.
807     if (!compareNonCommutativeOperandMapping(
808             {A, OperValsA, ValueNumberMappingA},
809             {B, OperValsB, ValueNumberMappingB}))
810       return false;
811 
812     // Here we check that between two corresponding instructions,
813     // when referring to a basic block in the same region, the
814     // relative locations are the same. And, that the instructions refer to
815     // basic blocks outside the region in the same corresponding locations.
816 
817     // We are able to make the assumption about blocks outside of the region
818     // since the target block labels are considered values and will follow the
819     // same number matching that we defined for the other instructions in the
820     // region.  So, at this point, in each location we target a specific block
821     // outside the region, we are targeting a corresponding block in each
822     // analagous location in the region we are comparing to.
823     if (!(isa<BranchInst>(IA) && isa<BranchInst>(IB)) &&
824         !(isa<PHINode>(IA) && isa<PHINode>(IB)))
825       continue;
826 
827     SmallVector<int, 4> &RelBlockLocsA = ItA->RelativeBlockLocations;
828     SmallVector<int, 4> &RelBlockLocsB = ItB->RelativeBlockLocations;
829     if (RelBlockLocsA.size() != RelBlockLocsB.size() &&
830         OperValsA.size() != OperValsB.size())
831       return false;
832 
833     ZippedRelativeLocationsT ZippedRelativeLocations =
834         zip(RelBlockLocsA, RelBlockLocsB, OperValsA, OperValsB);
835     if (any_of(ZippedRelativeLocations,
836                [&A, &B](std::tuple<int, int, Value *, Value *> R) {
837                  return !checkRelativeLocations(
838                      {A, std::get<0>(R), std::get<2>(R)},
839                      {B, std::get<1>(R), std::get<3>(R)});
840                }))
841       return false;
842   }
843   return true;
844 }
845 
846 bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A,
847                                     const IRSimilarityCandidate &B) {
848   auto DoesOverlap = [](const IRSimilarityCandidate &X,
849                         const IRSimilarityCandidate &Y) {
850     // Check:
851     // XXXXXX        X starts before Y ends
852     //      YYYYYYY  Y starts after X starts
853     return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx;
854   };
855 
856   return DoesOverlap(A, B) || DoesOverlap(B, A);
857 }
858 
859 void IRSimilarityIdentifier::populateMapper(
860     Module &M, std::vector<IRInstructionData *> &InstrList,
861     std::vector<unsigned> &IntegerMapping) {
862 
863   std::vector<IRInstructionData *> InstrListForModule;
864   std::vector<unsigned> IntegerMappingForModule;
865   // Iterate over the functions in the module to map each Instruction in each
866   // BasicBlock to an unsigned integer.
867   Mapper.initializeForBBs(M);
868 
869   for (Function &F : M) {
870 
871     if (F.empty())
872       continue;
873 
874     for (BasicBlock &BB : F) {
875 
876       // BB has potential to have similarity since it has a size greater than 2
877       // and can therefore match other regions greater than 2. Map it to a list
878       // of unsigned integers.
879       Mapper.convertToUnsignedVec(BB, InstrListForModule,
880                                   IntegerMappingForModule);
881     }
882 
883     BasicBlock::iterator It = F.begin()->end();
884     Mapper.mapToIllegalUnsigned(It, IntegerMappingForModule, InstrListForModule,
885                                 true);
886     if (InstrListForModule.size() > 0)
887       Mapper.IDL->push_back(*InstrListForModule.back());
888   }
889 
890   // Insert the InstrListForModule at the end of the overall InstrList so that
891   // we can have a long InstrList for the entire set of Modules being analyzed.
892   llvm::append_range(InstrList, InstrListForModule);
893   // Do the same as above, but for IntegerMapping.
894   llvm::append_range(IntegerMapping, IntegerMappingForModule);
895 }
896 
897 void IRSimilarityIdentifier::populateMapper(
898     ArrayRef<std::unique_ptr<Module>> &Modules,
899     std::vector<IRInstructionData *> &InstrList,
900     std::vector<unsigned> &IntegerMapping) {
901 
902   // Iterate over, and map the instructions in each module.
903   for (const std::unique_ptr<Module> &M : Modules)
904     populateMapper(*M, InstrList, IntegerMapping);
905 }
906 
907 /// From a repeated subsequence, find all the different instances of the
908 /// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
909 /// the IRInstructionData in subsequence.
910 ///
911 /// \param [in] Mapper - The instruction mapper for basic correctness checks.
912 /// \param [in] InstrList - The vector that holds the instruction data.
913 /// \param [in] IntegerMapping - The vector that holds the mapped integers.
914 /// \param [out] CandsForRepSubstring - The vector to store the generated
915 /// IRSimilarityCandidates.
916 static void createCandidatesFromSuffixTree(
917     const IRInstructionMapper& Mapper, std::vector<IRInstructionData *> &InstrList,
918     std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
919     std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
920 
921   unsigned StringLen = RS.Length;
922   if (StringLen < 2)
923     return;
924 
925   // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
926   for (const unsigned &StartIdx : RS.StartIndices) {
927     unsigned EndIdx = StartIdx + StringLen - 1;
928 
929     // Check that this subsequence does not contain an illegal instruction.
930     bool ContainsIllegal = false;
931     for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
932       unsigned Key = IntegerMapping[CurrIdx];
933       if (Key > Mapper.IllegalInstrNumber) {
934         ContainsIllegal = true;
935         break;
936       }
937     }
938 
939     // If we have an illegal instruction, we should not create an
940     // IRSimilarityCandidate for this region.
941     if (ContainsIllegal)
942       continue;
943 
944     // We are getting iterators to the instructions in this region of code
945     // by advancing the start and end indices from the start of the
946     // InstrList.
947     std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
948     std::advance(StartIt, StartIdx);
949     std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
950     std::advance(EndIt, EndIdx);
951 
952     CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt);
953   }
954 }
955 
956 void IRSimilarityCandidate::createCanonicalRelationFrom(
957     IRSimilarityCandidate &SourceCand,
958     DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
959     DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping) {
960   assert(SourceCand.CanonNumToNumber.size() != 0 &&
961          "Base canonical relationship is empty!");
962   assert(SourceCand.NumberToCanonNum.size() != 0 &&
963          "Base canonical relationship is empty!");
964 
965   assert(CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty");
966   assert(NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty");
967 
968   DenseSet<unsigned> UsedGVNs;
969   // Iterate over the mappings provided from this candidate to SourceCand.  We
970   // are then able to map the GVN in this candidate to the same canonical number
971   // given to the corresponding GVN in SourceCand.
972   for (std::pair<unsigned, DenseSet<unsigned>> &GVNMapping : ToSourceMapping) {
973     unsigned SourceGVN = GVNMapping.first;
974 
975     assert(GVNMapping.second.size() != 0 && "Possible GVNs is 0!");
976 
977     unsigned ResultGVN;
978     // We need special handling if we have more than one potential value.  This
979     // means that there are at least two GVNs that could correspond to this GVN.
980     // This could lead to potential swapping later on, so we make a decision
981     // here to ensure a one-to-one mapping.
982     if (GVNMapping.second.size() > 1) {
983       bool Found = false;
984       for (unsigned Val : GVNMapping.second) {
985         // We make sure the target value number hasn't already been reserved.
986         if (UsedGVNs.contains(Val))
987           continue;
988 
989         // We make sure that the opposite mapping is still consistent.
990         DenseMap<unsigned, DenseSet<unsigned>>::iterator It =
991             FromSourceMapping.find(Val);
992 
993         if (!It->second.contains(SourceGVN))
994           continue;
995 
996         // We pick the first item that satisfies these conditions.
997         Found = true;
998         ResultGVN = Val;
999         break;
1000       }
1001 
1002       assert(Found && "Could not find matching value for source GVN");
1003       (void)Found;
1004 
1005     } else
1006       ResultGVN = *GVNMapping.second.begin();
1007 
1008     // Whatever GVN is found, we mark it as used.
1009     UsedGVNs.insert(ResultGVN);
1010 
1011     unsigned CanonNum = *SourceCand.getCanonicalNum(ResultGVN);
1012     CanonNumToNumber.insert(std::make_pair(CanonNum, SourceGVN));
1013     NumberToCanonNum.insert(std::make_pair(SourceGVN, CanonNum));
1014   }
1015 
1016   DenseSet<BasicBlock *> BBSet;
1017   getBasicBlocks(BBSet);
1018   // Find canonical numbers for the BasicBlocks in the current candidate.
1019   // This is done by finding the corresponding value for the first instruction
1020   // in the block in the current candidate, finding the matching value in the
1021   // source candidate.  Then by finding the parent of this value, use the
1022   // canonical number of the block in the source candidate for the canonical
1023   // number in the current candidate.
1024   for (BasicBlock *BB : BBSet) {
1025     unsigned BBGVNForCurrCand = ValueToNumber.find(BB)->second;
1026 
1027     // We can skip the BasicBlock if the canonical numbering has already been
1028     // found in a separate instruction.
1029     if (NumberToCanonNum.find(BBGVNForCurrCand) != NumberToCanonNum.end())
1030       continue;
1031 
1032     // If the basic block is the starting block, then the shared instruction may
1033     // not be the first instruction in the block, it will be the first
1034     // instruction in the similarity region.
1035     Value *FirstOutlineInst = BB == getStartBB()
1036                                   ? frontInstruction()
1037                                   : &*BB->instructionsWithoutDebug().begin();
1038 
1039     unsigned FirstInstGVN = *getGVN(FirstOutlineInst);
1040     unsigned FirstInstCanonNum = *getCanonicalNum(FirstInstGVN);
1041     unsigned SourceGVN = *SourceCand.fromCanonicalNum(FirstInstCanonNum);
1042     Value *SourceV = *SourceCand.fromGVN(SourceGVN);
1043     BasicBlock *SourceBB = cast<Instruction>(SourceV)->getParent();
1044     unsigned SourceBBGVN = *SourceCand.getGVN(SourceBB);
1045     unsigned SourceCanonBBGVN = *SourceCand.getCanonicalNum(SourceBBGVN);
1046     CanonNumToNumber.insert(std::make_pair(SourceCanonBBGVN, BBGVNForCurrCand));
1047     NumberToCanonNum.insert(std::make_pair(BBGVNForCurrCand, SourceCanonBBGVN));
1048   }
1049 }
1050 
1051 void IRSimilarityCandidate::createCanonicalMappingFor(
1052     IRSimilarityCandidate &CurrCand) {
1053   assert(CurrCand.CanonNumToNumber.size() == 0 &&
1054          "Canonical Relationship is non-empty");
1055   assert(CurrCand.NumberToCanonNum.size() == 0 &&
1056          "Canonical Relationship is non-empty");
1057 
1058   unsigned CanonNum = 0;
1059   // Iterate over the value numbers found, the order does not matter in this
1060   // case.
1061   for (std::pair<unsigned, Value *> &NumToVal : CurrCand.NumberToValue) {
1062     CurrCand.NumberToCanonNum.insert(std::make_pair(NumToVal.first, CanonNum));
1063     CurrCand.CanonNumToNumber.insert(std::make_pair(CanonNum, NumToVal.first));
1064     CanonNum++;
1065   }
1066 }
1067 
1068 /// From the list of IRSimilarityCandidates, perform a comparison between each
1069 /// IRSimilarityCandidate to determine if there are overlapping
1070 /// IRInstructionData, or if they do not have the same structure.
1071 ///
1072 /// \param [in] CandsForRepSubstring - The vector containing the
1073 /// IRSimilarityCandidates.
1074 /// \param [out] StructuralGroups - the mapping of unsigned integers to vector
1075 /// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
1076 /// vector are structurally similar to one another.
1077 static void findCandidateStructures(
1078     std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
1079     DenseMap<unsigned, SimilarityGroup> &StructuralGroups) {
1080   std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
1081       InnerCandEndIt;
1082 
1083   // IRSimilarityCandidates each have a structure for operand use.  It is
1084   // possible that two instances of the same subsequences have different
1085   // structure. Each type of structure found is assigned a number.  This
1086   // DenseMap maps an IRSimilarityCandidate to which type of similarity
1087   // discovered it fits within.
1088   DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
1089 
1090   // Find the compatibility from each candidate to the others to determine
1091   // which candidates overlap and which have the same structure by mapping
1092   // each structure to a different group.
1093   bool SameStructure;
1094   bool Inserted;
1095   unsigned CurrentGroupNum = 0;
1096   unsigned OuterGroupNum;
1097   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
1098   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
1099   DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
1100 
1101   // Iterate over the candidates to determine its structural and overlapping
1102   // compatibility with other instructions
1103   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
1104   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
1105   for (CandIt = CandsForRepSubstring.begin(),
1106       CandEndIt = CandsForRepSubstring.end();
1107        CandIt != CandEndIt; CandIt++) {
1108 
1109     // Determine if it has an assigned structural group already.
1110     CandToGroupIt = CandToGroup.find(&*CandIt);
1111     if (CandToGroupIt == CandToGroup.end()) {
1112       // If not, we assign it one, and add it to our mapping.
1113       std::tie(CandToGroupIt, Inserted) =
1114           CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++));
1115     }
1116 
1117     // Get the structural group number from the iterator.
1118     OuterGroupNum = CandToGroupIt->second;
1119 
1120     // Check if we already have a list of IRSimilarityCandidates for the current
1121     // structural group.  Create one if one does not exist.
1122     CurrentGroupPair = StructuralGroups.find(OuterGroupNum);
1123     if (CurrentGroupPair == StructuralGroups.end()) {
1124       IRSimilarityCandidate::createCanonicalMappingFor(*CandIt);
1125       std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert(
1126           std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt})));
1127     }
1128 
1129     // Iterate over the IRSimilarityCandidates following the current
1130     // IRSimilarityCandidate in the list to determine whether the two
1131     // IRSimilarityCandidates are compatible.  This is so we do not repeat pairs
1132     // of IRSimilarityCandidates.
1133     for (InnerCandIt = std::next(CandIt),
1134         InnerCandEndIt = CandsForRepSubstring.end();
1135          InnerCandIt != InnerCandEndIt; InnerCandIt++) {
1136 
1137       // We check if the inner item has a group already, if it does, we skip it.
1138       CandToGroupItInner = CandToGroup.find(&*InnerCandIt);
1139       if (CandToGroupItInner != CandToGroup.end())
1140         continue;
1141 
1142       // Otherwise we determine if they have the same structure and add it to
1143       // vector if they match.
1144       ValueNumberMappingA.clear();
1145       ValueNumberMappingB.clear();
1146       SameStructure = IRSimilarityCandidate::compareStructure(
1147           *CandIt, *InnerCandIt, ValueNumberMappingA, ValueNumberMappingB);
1148       if (!SameStructure)
1149         continue;
1150 
1151       InnerCandIt->createCanonicalRelationFrom(*CandIt, ValueNumberMappingA,
1152                                                ValueNumberMappingB);
1153       CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum));
1154       CurrentGroupPair->second.push_back(*InnerCandIt);
1155     }
1156   }
1157 }
1158 
1159 void IRSimilarityIdentifier::findCandidates(
1160     std::vector<IRInstructionData *> &InstrList,
1161     std::vector<unsigned> &IntegerMapping) {
1162   SuffixTree ST(IntegerMapping);
1163 
1164   std::vector<IRSimilarityCandidate> CandsForRepSubstring;
1165   std::vector<SimilarityGroup> NewCandidateGroups;
1166 
1167   DenseMap<unsigned, SimilarityGroup> StructuralGroups;
1168 
1169   // Iterate over the subsequences found by the Suffix Tree to create
1170   // IRSimilarityCandidates for each repeated subsequence and determine which
1171   // instances are structurally similar to one another.
1172   for (SuffixTree::RepeatedSubstring &RS : ST) {
1173     createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, RS,
1174                                    CandsForRepSubstring);
1175 
1176     if (CandsForRepSubstring.size() < 2)
1177       continue;
1178 
1179     findCandidateStructures(CandsForRepSubstring, StructuralGroups);
1180     for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups)
1181       // We only add the group if it contains more than one
1182       // IRSimilarityCandidate.  If there is only one, that means there is no
1183       // other repeated subsequence with the same structure.
1184       if (Group.second.size() > 1)
1185         SimilarityCandidates->push_back(Group.second);
1186 
1187     CandsForRepSubstring.clear();
1188     StructuralGroups.clear();
1189     NewCandidateGroups.clear();
1190   }
1191 }
1192 
1193 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
1194     ArrayRef<std::unique_ptr<Module>> Modules) {
1195   resetSimilarityCandidates();
1196 
1197   std::vector<IRInstructionData *> InstrList;
1198   std::vector<unsigned> IntegerMapping;
1199   Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1200   Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1201   Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1202   Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1203   Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1204 
1205   populateMapper(Modules, InstrList, IntegerMapping);
1206   findCandidates(InstrList, IntegerMapping);
1207 
1208   return *SimilarityCandidates;
1209 }
1210 
1211 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
1212   resetSimilarityCandidates();
1213   Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1214   Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1215   Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1216   Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1217   Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1218 
1219   std::vector<IRInstructionData *> InstrList;
1220   std::vector<unsigned> IntegerMapping;
1221 
1222   populateMapper(M, InstrList, IntegerMapping);
1223   findCandidates(InstrList, IntegerMapping);
1224 
1225   return *SimilarityCandidates;
1226 }
1227 
1228 INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier",
1229                 "ir-similarity-identifier", false, true)
1230 
1231 IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
1232     : ModulePass(ID) {
1233   initializeIRSimilarityIdentifierWrapperPassPass(
1234       *PassRegistry::getPassRegistry());
1235 }
1236 
1237 bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
1238   IRSI.reset(new IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1239                                         MatchCallsByName, !DisableIntrinsics,
1240                                         false));
1241   return false;
1242 }
1243 
1244 bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) {
1245   IRSI.reset();
1246   return false;
1247 }
1248 
1249 bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
1250   IRSI->findSimilarity(M);
1251   return false;
1252 }
1253 
1254 AnalysisKey IRSimilarityAnalysis::Key;
1255 IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
1256                                                  ModuleAnalysisManager &) {
1257   auto IRSI = IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1258                                      MatchCallsByName, !DisableIntrinsics,
1259                                      false);
1260   IRSI.findSimilarity(M);
1261   return IRSI;
1262 }
1263 
1264 PreservedAnalyses
1265 IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) {
1266   IRSimilarityIdentifier &IRSI = AM.getResult<IRSimilarityAnalysis>(M);
1267   Optional<SimilarityGroupList> &SimilarityCandidatesOpt = IRSI.getSimilarity();
1268 
1269   for (std::vector<IRSimilarityCandidate> &CandVec : *SimilarityCandidatesOpt) {
1270     OS << CandVec.size() << " candidates of length "
1271        << CandVec.begin()->getLength() << ".  Found in: \n";
1272     for (IRSimilarityCandidate &Cand : CandVec) {
1273       OS << "  Function: " << Cand.front()->Inst->getFunction()->getName().str()
1274          << ", Basic Block: ";
1275       if (Cand.front()->Inst->getParent()->getName().str() == "")
1276         OS << "(unnamed)";
1277       else
1278         OS << Cand.front()->Inst->getParent()->getName().str();
1279       OS << "\n    Start Instruction: ";
1280       Cand.frontInstruction()->print(OS);
1281       OS << "\n      End Instruction: ";
1282       Cand.backInstruction()->print(OS);
1283       OS << "\n";
1284     }
1285   }
1286 
1287   return PreservedAnalyses::all();
1288 }
1289 
1290 char IRSimilarityIdentifierWrapperPass::ID = 0;
1291