xref: /freebsd/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 //===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Implementation file for the IRSimilarityIdentifier for identifying
11 // similarities in IR including the IRInstructionMapper.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Analysis/IRSimilarityIdentifier.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/ADT/SetOperations.h"
18 #include "llvm/IR/Intrinsics.h"
19 #include "llvm/IR/Operator.h"
20 #include "llvm/IR/User.h"
21 #include "llvm/InitializePasses.h"
22 #include "llvm/Support/SuffixTree.h"
23 
24 using namespace llvm;
25 using namespace IRSimilarity;
26 
27 namespace llvm {
28 cl::opt<bool>
29     DisableBranches("no-ir-sim-branch-matching", cl::init(false),
30                     cl::ReallyHidden,
31                     cl::desc("disable similarity matching, and outlining, "
32                              "across branches for debugging purposes."));
33 
34 cl::opt<bool>
35     DisableIndirectCalls("no-ir-sim-indirect-calls", cl::init(false),
36                          cl::ReallyHidden,
37                          cl::desc("disable outlining indirect calls."));
38 
39 cl::opt<bool>
40     MatchCallsByName("ir-sim-calls-by-name", cl::init(false), cl::ReallyHidden,
41                      cl::desc("only allow matching call instructions if the "
42                               "name and type signature match."));
43 
44 cl::opt<bool>
45     DisableIntrinsics("no-ir-sim-intrinsics", cl::init(false), cl::ReallyHidden,
46                       cl::desc("Don't match or outline intrinsics"));
47 } // namespace llvm
48 
49 IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
50                                      IRInstructionDataList &IDList)
51     : Inst(&I), Legal(Legality), IDL(&IDList) {
52   initializeInstruction();
53 }
54 
55 void IRInstructionData::initializeInstruction() {
56   // We check for whether we have a comparison instruction.  If it is, we
57   // find the "less than" version of the predicate for consistency for
58   // comparison instructions throught the program.
59   if (CmpInst *C = dyn_cast<CmpInst>(Inst)) {
60     CmpInst::Predicate Predicate = predicateForConsistency(C);
61     if (Predicate != C->getPredicate())
62       RevisedPredicate = Predicate;
63   }
64 
65   // Here we collect the operands and their types for determining whether
66   // the structure of the operand use matches between two different candidates.
67   for (Use &OI : Inst->operands()) {
68     if (isa<CmpInst>(Inst) && RevisedPredicate) {
69       // If we have a CmpInst where the predicate is reversed, it means the
70       // operands must be reversed as well.
71       OperVals.insert(OperVals.begin(), OI.get());
72       continue;
73     }
74 
75     OperVals.push_back(OI.get());
76   }
77 
78   // We capture the incoming BasicBlocks as values as well as the incoming
79   // Values in order to check for structural similarity.
80   if (PHINode *PN = dyn_cast<PHINode>(Inst))
81     for (BasicBlock *BB : PN->blocks())
82       OperVals.push_back(BB);
83 }
84 
85 IRInstructionData::IRInstructionData(IRInstructionDataList &IDList)
86     : IDL(&IDList) {}
87 
88 void IRInstructionData::setBranchSuccessors(
89     DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
90   assert(isa<BranchInst>(Inst) && "Instruction must be branch");
91 
92   BranchInst *BI = cast<BranchInst>(Inst);
93   DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
94 
95   BBNumIt = BasicBlockToInteger.find(BI->getParent());
96   assert(BBNumIt != BasicBlockToInteger.end() &&
97          "Could not find location for BasicBlock!");
98 
99   int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
100 
101   for (Value *V : getBlockOperVals()) {
102     BasicBlock *Successor = cast<BasicBlock>(V);
103     BBNumIt = BasicBlockToInteger.find(Successor);
104     assert(BBNumIt != BasicBlockToInteger.end() &&
105            "Could not find number for BasicBlock!");
106     int OtherBlockNumber = static_cast<int>(BBNumIt->second);
107 
108     int Relative = OtherBlockNumber - CurrentBlockNumber;
109     RelativeBlockLocations.push_back(Relative);
110   }
111 }
112 
113 ArrayRef<Value *> IRInstructionData::getBlockOperVals() {
114   assert((isa<BranchInst>(Inst) ||
115          isa<PHINode>(Inst)) && "Instruction must be branch or PHINode");
116 
117   if (BranchInst *BI = dyn_cast<BranchInst>(Inst))
118     return ArrayRef<Value *>(
119       std::next(OperVals.begin(), BI->isConditional() ? 1 : 0),
120       OperVals.end()
121     );
122 
123   if (PHINode *PN = dyn_cast<PHINode>(Inst))
124     return ArrayRef<Value *>(
125       std::next(OperVals.begin(), PN->getNumIncomingValues()),
126       OperVals.end()
127     );
128 
129   return ArrayRef<Value *>();
130 }
131 
132 void IRInstructionData::setCalleeName(bool MatchByName) {
133   CallInst *CI = dyn_cast<CallInst>(Inst);
134   assert(CI && "Instruction must be call");
135 
136   CalleeName = "";
137   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
138     // To hash intrinsics, we use the opcode, and types like the other
139     // instructions, but also, the Intrinsic ID, and the Name of the
140     // intrinsic.
141     Intrinsic::ID IntrinsicID = II->getIntrinsicID();
142     FunctionType *FT = II->getFunctionType();
143     // If there is an overloaded name, we have to use the complex version
144     // of getName to get the entire string.
145     if (Intrinsic::isOverloaded(IntrinsicID))
146       CalleeName =
147           Intrinsic::getName(IntrinsicID, FT->params(), II->getModule(), FT);
148     // If there is not an overloaded name, we only need to use this version.
149     else
150       CalleeName = Intrinsic::getName(IntrinsicID).str();
151 
152     return;
153   }
154 
155   if (!CI->isIndirectCall() && MatchByName)
156     CalleeName = CI->getCalledFunction()->getName().str();
157 }
158 
159 void IRInstructionData::setPHIPredecessors(
160     DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
161   assert(isa<PHINode>(Inst) && "Instruction must be phi node");
162 
163   PHINode *PN = cast<PHINode>(Inst);
164   DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
165 
166   BBNumIt = BasicBlockToInteger.find(PN->getParent());
167   assert(BBNumIt != BasicBlockToInteger.end() &&
168          "Could not find location for BasicBlock!");
169 
170   int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
171 
172   // Convert the incoming blocks of the PHINode to an integer value, based on
173   // the relative distances between the current block and the incoming block.
174   for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) {
175     BasicBlock *Incoming = PN->getIncomingBlock(Idx);
176     BBNumIt = BasicBlockToInteger.find(Incoming);
177     assert(BBNumIt != BasicBlockToInteger.end() &&
178            "Could not find number for BasicBlock!");
179     int OtherBlockNumber = static_cast<int>(BBNumIt->second);
180 
181     int Relative = OtherBlockNumber - CurrentBlockNumber;
182     RelativeBlockLocations.push_back(Relative);
183   }
184 }
185 
186 CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
187   switch (CI->getPredicate()) {
188   case CmpInst::FCMP_OGT:
189   case CmpInst::FCMP_UGT:
190   case CmpInst::FCMP_OGE:
191   case CmpInst::FCMP_UGE:
192   case CmpInst::ICMP_SGT:
193   case CmpInst::ICMP_UGT:
194   case CmpInst::ICMP_SGE:
195   case CmpInst::ICMP_UGE:
196     return CI->getSwappedPredicate();
197   default:
198     return CI->getPredicate();
199   }
200 }
201 
202 CmpInst::Predicate IRInstructionData::getPredicate() const {
203   assert(isa<CmpInst>(Inst) &&
204          "Can only get a predicate from a compare instruction");
205 
206   if (RevisedPredicate)
207     return *RevisedPredicate;
208 
209   return cast<CmpInst>(Inst)->getPredicate();
210 }
211 
212 StringRef IRInstructionData::getCalleeName() const {
213   assert(isa<CallInst>(Inst) &&
214          "Can only get a name from a call instruction");
215 
216   assert(CalleeName && "CalleeName has not been set");
217 
218   return *CalleeName;
219 }
220 
221 bool IRSimilarity::isClose(const IRInstructionData &A,
222                            const IRInstructionData &B) {
223 
224   if (!A.Legal || !B.Legal)
225     return false;
226 
227   // Check if we are performing the same sort of operation on the same types
228   // but not on the same values.
229   if (!A.Inst->isSameOperationAs(B.Inst)) {
230     // If there is a predicate, this means that either there is a swapped
231     // predicate, or that the types are different, we want to make sure that
232     // the predicates are equivalent via swapping.
233     if (isa<CmpInst>(A.Inst) && isa<CmpInst>(B.Inst)) {
234 
235       if (A.getPredicate() != B.getPredicate())
236         return false;
237 
238       // If the predicates are the same via swap, make sure that the types are
239       // still the same.
240       auto ZippedTypes = zip(A.OperVals, B.OperVals);
241 
242       return all_of(
243           ZippedTypes, [](std::tuple<llvm::Value *, llvm::Value *> R) {
244             return std::get<0>(R)->getType() == std::get<1>(R)->getType();
245           });
246     }
247 
248     return false;
249   }
250 
251   // Since any GEP Instruction operands after the first operand cannot be
252   // defined by a register, we must make sure that the operands after the first
253   // are the same in the two instructions
254   if (auto *GEP = dyn_cast<GetElementPtrInst>(A.Inst)) {
255     auto *OtherGEP = cast<GetElementPtrInst>(B.Inst);
256 
257     // If the instructions do not have the same inbounds restrictions, we do
258     // not consider them the same.
259     if (GEP->isInBounds() != OtherGEP->isInBounds())
260       return false;
261 
262     auto ZippedOperands = zip(GEP->indices(), OtherGEP->indices());
263 
264     // We increment here since we do not care about the first instruction,
265     // we only care about the following operands since they must be the
266     // exact same to be considered similar.
267     return all_of(drop_begin(ZippedOperands),
268                   [](std::tuple<llvm::Use &, llvm::Use &> R) {
269                     return std::get<0>(R) == std::get<1>(R);
270                   });
271   }
272 
273   // If the instructions are functions calls, we make sure that the function
274   // name is the same.  We already know that the types are since is
275   // isSameOperationAs is true.
276   if (isa<CallInst>(A.Inst) && isa<CallInst>(B.Inst)) {
277     if (A.getCalleeName() != B.getCalleeName())
278       return false;
279   }
280 
281   if (isa<BranchInst>(A.Inst) && isa<BranchInst>(B.Inst) &&
282       A.RelativeBlockLocations.size() != B.RelativeBlockLocations.size())
283     return false;
284 
285   return true;
286 }
287 
288 // TODO: This is the same as the MachineOutliner, and should be consolidated
289 // into the same interface.
290 void IRInstructionMapper::convertToUnsignedVec(
291     BasicBlock &BB, std::vector<IRInstructionData *> &InstrList,
292     std::vector<unsigned> &IntegerMapping) {
293   BasicBlock::iterator It = BB.begin();
294 
295   std::vector<unsigned> IntegerMappingForBB;
296   std::vector<IRInstructionData *> InstrListForBB;
297 
298   for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
299     switch (InstClassifier.visit(*It)) {
300     case InstrType::Legal:
301       mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
302       break;
303     case InstrType::Illegal:
304       mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
305       break;
306     case InstrType::Invisible:
307       AddedIllegalLastTime = false;
308       break;
309     }
310   }
311 
312   if (AddedIllegalLastTime)
313     mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true);
314   for (IRInstructionData *ID : InstrListForBB)
315     this->IDL->push_back(*ID);
316   llvm::append_range(InstrList, InstrListForBB);
317   llvm::append_range(IntegerMapping, IntegerMappingForBB);
318 }
319 
320 // TODO: This is the same as the MachineOutliner, and should be consolidated
321 // into the same interface.
322 unsigned IRInstructionMapper::mapToLegalUnsigned(
323     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
324     std::vector<IRInstructionData *> &InstrListForBB) {
325   // We added something legal, so we should unset the AddedLegalLastTime
326   // flag.
327   AddedIllegalLastTime = false;
328 
329   // If we have at least two adjacent legal instructions (which may have
330   // invisible instructions in between), remember that.
331   if (CanCombineWithPrevInstr)
332     HaveLegalRange = true;
333   CanCombineWithPrevInstr = true;
334 
335   // Get the integer for this instruction or give it the current
336   // LegalInstrNumber.
337   IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL);
338   InstrListForBB.push_back(ID);
339 
340   if (isa<BranchInst>(*It))
341     ID->setBranchSuccessors(BasicBlockToInteger);
342 
343   if (isa<CallInst>(*It))
344     ID->setCalleeName(EnableMatchCallsByName);
345 
346   if (isa<PHINode>(*It))
347     ID->setPHIPredecessors(BasicBlockToInteger);
348 
349   // Add to the instruction list
350   bool WasInserted;
351   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
352       ResultIt;
353   std::tie(ResultIt, WasInserted) =
354       InstructionIntegerMap.insert(std::make_pair(ID, LegalInstrNumber));
355   unsigned INumber = ResultIt->second;
356 
357   // There was an insertion.
358   if (WasInserted)
359     LegalInstrNumber++;
360 
361   IntegerMappingForBB.push_back(INumber);
362 
363   // Make sure we don't overflow or use any integers reserved by the DenseMap.
364   assert(LegalInstrNumber < IllegalInstrNumber &&
365          "Instruction mapping overflow!");
366 
367   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
368          "Tried to assign DenseMap tombstone or empty key to instruction.");
369   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
370          "Tried to assign DenseMap tombstone or empty key to instruction.");
371 
372   return INumber;
373 }
374 
375 IRInstructionData *
376 IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
377                                                IRInstructionDataList &IDL) {
378   return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
379 }
380 
381 IRInstructionData *
382 IRInstructionMapper::allocateIRInstructionData(IRInstructionDataList &IDL) {
383   return new (InstDataAllocator->Allocate()) IRInstructionData(IDL);
384 }
385 
386 IRInstructionDataList *
387 IRInstructionMapper::allocateIRInstructionDataList() {
388   return new (IDLAllocator->Allocate()) IRInstructionDataList();
389 }
390 
391 // TODO: This is the same as the MachineOutliner, and should be consolidated
392 // into the same interface.
393 unsigned IRInstructionMapper::mapToIllegalUnsigned(
394     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
395     std::vector<IRInstructionData *> &InstrListForBB, bool End) {
396   // Can't combine an illegal instruction. Set the flag.
397   CanCombineWithPrevInstr = false;
398 
399   // Only add one illegal number per range of legal numbers.
400   if (AddedIllegalLastTime)
401     return IllegalInstrNumber;
402 
403   IRInstructionData *ID = nullptr;
404   if (!End)
405     ID = allocateIRInstructionData(*It, false, *IDL);
406   else
407     ID = allocateIRInstructionData(*IDL);
408   InstrListForBB.push_back(ID);
409 
410   // Remember that we added an illegal number last time.
411   AddedIllegalLastTime = true;
412   unsigned INumber = IllegalInstrNumber;
413   IntegerMappingForBB.push_back(IllegalInstrNumber--);
414 
415   assert(LegalInstrNumber < IllegalInstrNumber &&
416          "Instruction mapping overflow!");
417 
418   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
419          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
420 
421   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
422          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
423 
424   return INumber;
425 }
426 
427 IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
428                                              IRInstructionData *FirstInstIt,
429                                              IRInstructionData *LastInstIt)
430     : StartIdx(StartIdx), Len(Len) {
431 
432   assert(FirstInstIt != nullptr && "Instruction is nullptr!");
433   assert(LastInstIt != nullptr && "Instruction is nullptr!");
434   assert(StartIdx + Len > StartIdx &&
435          "Overflow for IRSimilarityCandidate range?");
436   assert(Len - 1 == static_cast<unsigned>(std::distance(
437                         iterator(FirstInstIt), iterator(LastInstIt))) &&
438          "Length of the first and last IRInstructionData do not match the "
439          "given length");
440 
441   // We iterate over the given instructions, and map each unique value
442   // to a unique number in the IRSimilarityCandidate ValueToNumber and
443   // NumberToValue maps.  A constant get its own value globally, the individual
444   // uses of the constants are not considered to be unique.
445   //
446   // IR:                    Mapping Added:
447   // %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
448   // %add2 = add i32 %a, %1    %add2 -> 4
449   // %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
450   //
451   // when replace with global values, starting from 1, would be
452   //
453   // 3 = add i32 1, 2
454   // 4 = add i32 1, 3
455   // 6 = add i32 5, 2
456   unsigned LocalValNumber = 1;
457   IRInstructionDataList::iterator ID = iterator(*FirstInstIt);
458   for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) {
459     // Map the operand values to an unsigned integer if it does not already
460     // have an unsigned integer assigned to it.
461     for (Value *Arg : ID->OperVals)
462       if (!ValueToNumber.contains(Arg)) {
463         ValueToNumber.try_emplace(Arg, LocalValNumber);
464         NumberToValue.try_emplace(LocalValNumber, Arg);
465         LocalValNumber++;
466       }
467 
468     // Mapping the instructions to an unsigned integer if it is not already
469     // exist in the mapping.
470     if (!ValueToNumber.contains(ID->Inst)) {
471       ValueToNumber.try_emplace(ID->Inst, LocalValNumber);
472       NumberToValue.try_emplace(LocalValNumber, ID->Inst);
473       LocalValNumber++;
474     }
475   }
476 
477   // Setting the first and last instruction data pointers for the candidate.  If
478   // we got through the entire for loop without hitting an assert, we know
479   // that both of these instructions are not nullptrs.
480   FirstInst = FirstInstIt;
481   LastInst = LastInstIt;
482 
483   // Add the basic blocks contained in the set into the global value numbering.
484   DenseSet<BasicBlock *> BBSet;
485   getBasicBlocks(BBSet);
486   for (BasicBlock *BB : BBSet) {
487     if (ValueToNumber.contains(BB))
488       continue;
489 
490     ValueToNumber.try_emplace(BB, LocalValNumber);
491     NumberToValue.try_emplace(LocalValNumber, BB);
492     LocalValNumber++;
493   }
494 }
495 
496 bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A,
497                                       const IRSimilarityCandidate &B) {
498   if (A.getLength() != B.getLength())
499     return false;
500 
501   auto InstrDataForBoth =
502       zip(make_range(A.begin(), A.end()), make_range(B.begin(), B.end()));
503 
504   return all_of(InstrDataForBoth,
505                 [](std::tuple<IRInstructionData &, IRInstructionData &> R) {
506                   IRInstructionData &A = std::get<0>(R);
507                   IRInstructionData &B = std::get<1>(R);
508                   if (!A.Legal || !B.Legal)
509                     return false;
510                   return isClose(A, B);
511                 });
512 }
513 
514 /// Determine if one or more of the assigned global value numbers for the
515 /// operands in \p TargetValueNumbers is in the current mapping set for operand
516 /// numbers in \p SourceOperands.  The set of possible corresponding global
517 /// value numbers are replaced with the most recent version of compatible
518 /// values.
519 ///
520 /// \param [in] SourceValueToNumberMapping - The mapping of a Value to global
521 /// value number for the source IRInstructionCandidate.
522 /// \param [in, out] CurrentSrcTgtNumberMapping - The current mapping of source
523 /// IRSimilarityCandidate global value numbers to a set of possible numbers in
524 /// the target.
525 /// \param [in] SourceOperands - The operands in the original
526 /// IRSimilarityCandidate in the current instruction.
527 /// \param [in] TargetValueNumbers - The global value numbers of the operands in
528 /// the corresponding Instruction in the other IRSimilarityCandidate.
529 /// \returns true if there exists a possible mapping between the source
530 /// Instruction operands and the target Instruction operands, and false if not.
531 static bool checkNumberingAndReplaceCommutative(
532   const DenseMap<Value *, unsigned> &SourceValueToNumberMapping,
533   DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
534   ArrayRef<Value *> &SourceOperands,
535   DenseSet<unsigned> &TargetValueNumbers){
536 
537   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
538 
539   unsigned ArgVal;
540   bool WasInserted;
541 
542   // Iterate over the operands in the source IRSimilarityCandidate to determine
543   // whether there exists an operand in the other IRSimilarityCandidate that
544   // creates a valid mapping of Value to Value between the
545   // IRSimilarityCaniddates.
546   for (Value *V : SourceOperands) {
547     ArgVal = SourceValueToNumberMapping.find(V)->second;
548 
549     // Instead of finding a current mapping, we attempt to insert a set.
550     std::tie(ValueMappingIt, WasInserted) = CurrentSrcTgtNumberMapping.insert(
551         std::make_pair(ArgVal, TargetValueNumbers));
552 
553     // We need to iterate over the items in other IRSimilarityCandidate's
554     // Instruction to determine whether there is a valid mapping of
555     // Value to Value.
556     DenseSet<unsigned> NewSet;
557     for (unsigned &Curr : ValueMappingIt->second)
558       // If we can find the value in the mapping, we add it to the new set.
559       if (TargetValueNumbers.contains(Curr))
560         NewSet.insert(Curr);
561 
562     // If we could not find a Value, return 0.
563     if (NewSet.empty())
564       return false;
565 
566     // Otherwise replace the old mapping with the newly constructed one.
567     if (NewSet.size() != ValueMappingIt->second.size())
568       ValueMappingIt->second.swap(NewSet);
569 
570     // We have reached no conclusions about the mapping, and cannot remove
571     // any items from the other operands, so we move to check the next operand.
572     if (ValueMappingIt->second.size() != 1)
573       continue;
574 
575     unsigned ValToRemove = *ValueMappingIt->second.begin();
576     // When there is only one item left in the mapping for and operand, remove
577     // the value from the other operands.  If it results in there being no
578     // mapping, return false, it means the mapping is wrong
579     for (Value *InnerV : SourceOperands) {
580       if (V == InnerV)
581         continue;
582 
583       unsigned InnerVal = SourceValueToNumberMapping.find(InnerV)->second;
584       ValueMappingIt = CurrentSrcTgtNumberMapping.find(InnerVal);
585       if (ValueMappingIt == CurrentSrcTgtNumberMapping.end())
586         continue;
587 
588       ValueMappingIt->second.erase(ValToRemove);
589       if (ValueMappingIt->second.empty())
590         return false;
591     }
592   }
593 
594   return true;
595 }
596 
597 /// Determine if operand number \p TargetArgVal is in the current mapping set
598 /// for operand number \p SourceArgVal.
599 ///
600 /// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global
601 /// value numbers from source IRSimilarityCandidate to target
602 /// IRSimilarityCandidate.
603 /// \param [in] SourceArgVal The global value number for an operand in the
604 /// in the original candidate.
605 /// \param [in] TargetArgVal The global value number for the corresponding
606 /// operand in the other candidate.
607 /// \returns True if there exists a mapping and false if not.
608 bool checkNumberingAndReplace(
609     DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
610     unsigned SourceArgVal, unsigned TargetArgVal) {
611   // We are given two unsigned integers representing the global values of
612   // the operands in different IRSimilarityCandidates and a current mapping
613   // between the two.
614   //
615   // Source Operand GVN: 1
616   // Target Operand GVN: 2
617   // CurrentMapping: {1: {1, 2}}
618   //
619   // Since we have mapping, and the target operand is contained in the set, we
620   // update it to:
621   // CurrentMapping: {1: {2}}
622   // and can return true. But, if the mapping was
623   // CurrentMapping: {1: {3}}
624   // we would return false.
625 
626   bool WasInserted;
627   DenseMap<unsigned, DenseSet<unsigned>>::iterator Val;
628 
629   std::tie(Val, WasInserted) = CurrentSrcTgtNumberMapping.insert(
630       std::make_pair(SourceArgVal, DenseSet<unsigned>({TargetArgVal})));
631 
632   // If we created a new mapping, then we are done.
633   if (WasInserted)
634     return true;
635 
636   // If there is more than one option in the mapping set, and the target value
637   // is included in the mapping set replace that set with one that only includes
638   // the target value, as it is the only valid mapping via the non commutative
639   // instruction.
640 
641   DenseSet<unsigned> &TargetSet = Val->second;
642   if (TargetSet.size() > 1 && TargetSet.contains(TargetArgVal)) {
643     TargetSet.clear();
644     TargetSet.insert(TargetArgVal);
645     return true;
646   }
647 
648   // Return true if we can find the value in the set.
649   return TargetSet.contains(TargetArgVal);
650 }
651 
652 bool IRSimilarityCandidate::compareNonCommutativeOperandMapping(
653     OperandMapping A, OperandMapping B) {
654   // Iterators to keep track of where we are in the operands for each
655   // Instruction.
656   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
657   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
658   unsigned OperandLength = A.OperVals.size();
659 
660   // For each operand, get the value numbering and ensure it is consistent.
661   for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) {
662     unsigned OperValA = A.IRSC.ValueToNumber.find(*VItA)->second;
663     unsigned OperValB = B.IRSC.ValueToNumber.find(*VItB)->second;
664 
665     // Attempt to add a set with only the target value.  If there is no mapping
666     // we can create it here.
667     //
668     // For an instruction like a subtraction:
669     // IRSimilarityCandidateA:  IRSimilarityCandidateB:
670     // %resultA = sub %a, %b    %resultB = sub %d, %e
671     //
672     // We map %a -> %d and %b -> %e.
673     //
674     // And check to see whether their mapping is consistent in
675     // checkNumberingAndReplace.
676 
677     if (!checkNumberingAndReplace(A.ValueNumberMapping, OperValA, OperValB))
678       return false;
679 
680     if (!checkNumberingAndReplace(B.ValueNumberMapping, OperValB, OperValA))
681       return false;
682   }
683   return true;
684 }
685 
686 bool IRSimilarityCandidate::compareCommutativeOperandMapping(
687     OperandMapping A, OperandMapping B) {
688   DenseSet<unsigned> ValueNumbersA;
689   DenseSet<unsigned> ValueNumbersB;
690 
691   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
692   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
693   unsigned OperandLength = A.OperVals.size();
694 
695   // Find the value number sets for the operands.
696   for (unsigned Idx = 0; Idx < OperandLength;
697        Idx++, VItA++, VItB++) {
698     ValueNumbersA.insert(A.IRSC.ValueToNumber.find(*VItA)->second);
699     ValueNumbersB.insert(B.IRSC.ValueToNumber.find(*VItB)->second);
700   }
701 
702   // Iterate over the operands in the first IRSimilarityCandidate and make sure
703   // there exists a possible mapping with the operands in the second
704   // IRSimilarityCandidate.
705   if (!checkNumberingAndReplaceCommutative(A.IRSC.ValueToNumber,
706                                            A.ValueNumberMapping, A.OperVals,
707                                            ValueNumbersB))
708     return false;
709 
710   // Iterate over the operands in the second IRSimilarityCandidate and make sure
711   // there exists a possible mapping with the operands in the first
712   // IRSimilarityCandidate.
713   if (!checkNumberingAndReplaceCommutative(B.IRSC.ValueToNumber,
714                                            B.ValueNumberMapping, B.OperVals,
715                                            ValueNumbersA))
716     return false;
717 
718   return true;
719 }
720 
721 bool IRSimilarityCandidate::compareAssignmentMapping(
722     const unsigned InstValA, const unsigned &InstValB,
723     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
724     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
725   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
726   bool WasInserted;
727   std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingA.insert(
728       std::make_pair(InstValA, DenseSet<unsigned>({InstValB})));
729   if (!WasInserted && !ValueMappingIt->second.contains(InstValB))
730     return false;
731   else if (ValueMappingIt->second.size() != 1) {
732     for (unsigned OtherVal : ValueMappingIt->second) {
733       if (OtherVal == InstValB)
734         continue;
735       if (!ValueNumberMappingA.contains(OtherVal))
736         continue;
737       if (!ValueNumberMappingA[OtherVal].contains(InstValA))
738         continue;
739       ValueNumberMappingA[OtherVal].erase(InstValA);
740     }
741     ValueNumberMappingA.erase(ValueMappingIt);
742     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingA.insert(
743       std::make_pair(InstValA, DenseSet<unsigned>({InstValB})));
744   }
745 
746   return true;
747 }
748 
749 bool IRSimilarityCandidate::checkRelativeLocations(RelativeLocMapping A,
750                                                    RelativeLocMapping B) {
751   // Get the basic blocks the label refers to.
752   BasicBlock *ABB = cast<BasicBlock>(A.OperVal);
753   BasicBlock *BBB = cast<BasicBlock>(B.OperVal);
754 
755   // Get the basic blocks contained in each region.
756   DenseSet<BasicBlock *> BasicBlockA;
757   DenseSet<BasicBlock *> BasicBlockB;
758   A.IRSC.getBasicBlocks(BasicBlockA);
759   B.IRSC.getBasicBlocks(BasicBlockB);
760 
761   // Determine if the block is contained in the region.
762   bool AContained = BasicBlockA.contains(ABB);
763   bool BContained = BasicBlockB.contains(BBB);
764 
765   // Both blocks need to be contained in the region, or both need to be outside
766   // the region.
767   if (AContained != BContained)
768     return false;
769 
770   // If both are contained, then we need to make sure that the relative
771   // distance to the target blocks are the same.
772   if (AContained)
773     return A.RelativeLocation == B.RelativeLocation;
774   return true;
775 }
776 
777 bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
778                                              const IRSimilarityCandidate &B) {
779   DenseMap<unsigned, DenseSet<unsigned>> MappingA;
780   DenseMap<unsigned, DenseSet<unsigned>> MappingB;
781   return IRSimilarityCandidate::compareStructure(A, B, MappingA, MappingB);
782 }
783 
784 typedef detail::zippy<detail::zip_shortest, SmallVector<int, 4> &,
785                       SmallVector<int, 4> &, ArrayRef<Value *> &,
786                       ArrayRef<Value *> &>
787     ZippedRelativeLocationsT;
788 
789 bool IRSimilarityCandidate::compareStructure(
790     const IRSimilarityCandidate &A, const IRSimilarityCandidate &B,
791     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
792     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
793   if (A.getLength() != B.getLength())
794     return false;
795 
796   if (A.ValueToNumber.size() != B.ValueToNumber.size())
797     return false;
798 
799   iterator ItA = A.begin();
800   iterator ItB = B.begin();
801 
802   // These ValueNumber Mapping sets create a create a mapping between the values
803   // in one candidate to values in the other candidate.  If we create a set with
804   // one element, and that same element maps to the original element in the
805   // candidate we have a good mapping.
806 
807   // Iterate over the instructions contained in each candidate
808   unsigned SectionLength = A.getStartIdx() + A.getLength();
809   for (unsigned Loc = A.getStartIdx(); Loc < SectionLength;
810        ItA++, ItB++, Loc++) {
811     // Make sure the instructions are similar to one another.
812     if (!isClose(*ItA, *ItB))
813       return false;
814 
815     Instruction *IA = ItA->Inst;
816     Instruction *IB = ItB->Inst;
817 
818     if (!ItA->Legal || !ItB->Legal)
819       return false;
820 
821     // Get the operand sets for the instructions.
822     ArrayRef<Value *> OperValsA = ItA->OperVals;
823     ArrayRef<Value *> OperValsB = ItB->OperVals;
824 
825     unsigned InstValA = A.ValueToNumber.find(IA)->second;
826     unsigned InstValB = B.ValueToNumber.find(IB)->second;
827 
828     // Ensure that the mappings for the instructions exists.
829     if (!compareAssignmentMapping(InstValA, InstValB, ValueNumberMappingA,
830                                   ValueNumberMappingB))
831       return false;
832 
833     if (!compareAssignmentMapping(InstValB, InstValA, ValueNumberMappingB,
834                                   ValueNumberMappingA))
835       return false;
836 
837     // We have different paths for commutative instructions and non-commutative
838     // instructions since commutative instructions could allow multiple mappings
839     // to certain values.
840     if (IA->isCommutative() && !isa<FPMathOperator>(IA) &&
841         !isa<IntrinsicInst>(IA)) {
842       if (!compareCommutativeOperandMapping(
843               {A, OperValsA, ValueNumberMappingA},
844               {B, OperValsB, ValueNumberMappingB}))
845         return false;
846       continue;
847     }
848 
849     // Handle the non-commutative cases.
850     if (!compareNonCommutativeOperandMapping(
851             {A, OperValsA, ValueNumberMappingA},
852             {B, OperValsB, ValueNumberMappingB}))
853       return false;
854 
855     // Here we check that between two corresponding instructions,
856     // when referring to a basic block in the same region, the
857     // relative locations are the same. And, that the instructions refer to
858     // basic blocks outside the region in the same corresponding locations.
859 
860     // We are able to make the assumption about blocks outside of the region
861     // since the target block labels are considered values and will follow the
862     // same number matching that we defined for the other instructions in the
863     // region.  So, at this point, in each location we target a specific block
864     // outside the region, we are targeting a corresponding block in each
865     // analagous location in the region we are comparing to.
866     if (!(isa<BranchInst>(IA) && isa<BranchInst>(IB)) &&
867         !(isa<PHINode>(IA) && isa<PHINode>(IB)))
868       continue;
869 
870     SmallVector<int, 4> &RelBlockLocsA = ItA->RelativeBlockLocations;
871     SmallVector<int, 4> &RelBlockLocsB = ItB->RelativeBlockLocations;
872     ArrayRef<Value *> ABL = ItA->getBlockOperVals();
873     ArrayRef<Value *> BBL = ItB->getBlockOperVals();
874 
875     // Check to make sure that the number of operands, and branching locations
876     // between BranchInsts is the same.
877     if (RelBlockLocsA.size() != RelBlockLocsB.size() &&
878         ABL.size() != BBL.size())
879       return false;
880 
881     assert(RelBlockLocsA.size() == ABL.size() &&
882            "Block information vectors not the same size.");
883     assert(RelBlockLocsB.size() == BBL.size() &&
884            "Block information vectors not the same size.");
885 
886     ZippedRelativeLocationsT ZippedRelativeLocations =
887         zip(RelBlockLocsA, RelBlockLocsB, ABL, BBL);
888     if (any_of(ZippedRelativeLocations,
889                [&A, &B](std::tuple<int, int, Value *, Value *> R) {
890                  return !checkRelativeLocations(
891                      {A, std::get<0>(R), std::get<2>(R)},
892                      {B, std::get<1>(R), std::get<3>(R)});
893                }))
894       return false;
895   }
896   return true;
897 }
898 
899 bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A,
900                                     const IRSimilarityCandidate &B) {
901   auto DoesOverlap = [](const IRSimilarityCandidate &X,
902                         const IRSimilarityCandidate &Y) {
903     // Check:
904     // XXXXXX        X starts before Y ends
905     //      YYYYYYY  Y starts after X starts
906     return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx;
907   };
908 
909   return DoesOverlap(A, B) || DoesOverlap(B, A);
910 }
911 
912 void IRSimilarityIdentifier::populateMapper(
913     Module &M, std::vector<IRInstructionData *> &InstrList,
914     std::vector<unsigned> &IntegerMapping) {
915 
916   std::vector<IRInstructionData *> InstrListForModule;
917   std::vector<unsigned> IntegerMappingForModule;
918   // Iterate over the functions in the module to map each Instruction in each
919   // BasicBlock to an unsigned integer.
920   Mapper.initializeForBBs(M);
921 
922   for (Function &F : M) {
923 
924     if (F.empty())
925       continue;
926 
927     for (BasicBlock &BB : F) {
928 
929       // BB has potential to have similarity since it has a size greater than 2
930       // and can therefore match other regions greater than 2. Map it to a list
931       // of unsigned integers.
932       Mapper.convertToUnsignedVec(BB, InstrListForModule,
933                                   IntegerMappingForModule);
934     }
935 
936     BasicBlock::iterator It = F.begin()->end();
937     Mapper.mapToIllegalUnsigned(It, IntegerMappingForModule, InstrListForModule,
938                                 true);
939     if (InstrListForModule.size() > 0)
940       Mapper.IDL->push_back(*InstrListForModule.back());
941   }
942 
943   // Insert the InstrListForModule at the end of the overall InstrList so that
944   // we can have a long InstrList for the entire set of Modules being analyzed.
945   llvm::append_range(InstrList, InstrListForModule);
946   // Do the same as above, but for IntegerMapping.
947   llvm::append_range(IntegerMapping, IntegerMappingForModule);
948 }
949 
950 void IRSimilarityIdentifier::populateMapper(
951     ArrayRef<std::unique_ptr<Module>> &Modules,
952     std::vector<IRInstructionData *> &InstrList,
953     std::vector<unsigned> &IntegerMapping) {
954 
955   // Iterate over, and map the instructions in each module.
956   for (const std::unique_ptr<Module> &M : Modules)
957     populateMapper(*M, InstrList, IntegerMapping);
958 }
959 
960 /// From a repeated subsequence, find all the different instances of the
961 /// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
962 /// the IRInstructionData in subsequence.
963 ///
964 /// \param [in] Mapper - The instruction mapper for basic correctness checks.
965 /// \param [in] InstrList - The vector that holds the instruction data.
966 /// \param [in] IntegerMapping - The vector that holds the mapped integers.
967 /// \param [out] CandsForRepSubstring - The vector to store the generated
968 /// IRSimilarityCandidates.
969 static void createCandidatesFromSuffixTree(
970     const IRInstructionMapper& Mapper, std::vector<IRInstructionData *> &InstrList,
971     std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
972     std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
973 
974   unsigned StringLen = RS.Length;
975   if (StringLen < 2)
976     return;
977 
978   // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
979   for (const unsigned &StartIdx : RS.StartIndices) {
980     unsigned EndIdx = StartIdx + StringLen - 1;
981 
982     // Check that this subsequence does not contain an illegal instruction.
983     bool ContainsIllegal = false;
984     for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
985       unsigned Key = IntegerMapping[CurrIdx];
986       if (Key > Mapper.IllegalInstrNumber) {
987         ContainsIllegal = true;
988         break;
989       }
990     }
991 
992     // If we have an illegal instruction, we should not create an
993     // IRSimilarityCandidate for this region.
994     if (ContainsIllegal)
995       continue;
996 
997     // We are getting iterators to the instructions in this region of code
998     // by advancing the start and end indices from the start of the
999     // InstrList.
1000     std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
1001     std::advance(StartIt, StartIdx);
1002     std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
1003     std::advance(EndIt, EndIdx);
1004 
1005     CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt);
1006   }
1007 }
1008 
1009 void IRSimilarityCandidate::createCanonicalRelationFrom(
1010     IRSimilarityCandidate &SourceCand,
1011     DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
1012     DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping) {
1013   assert(SourceCand.CanonNumToNumber.size() != 0 &&
1014          "Base canonical relationship is empty!");
1015   assert(SourceCand.NumberToCanonNum.size() != 0 &&
1016          "Base canonical relationship is empty!");
1017 
1018   assert(CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty");
1019   assert(NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty");
1020 
1021   DenseSet<unsigned> UsedGVNs;
1022   // Iterate over the mappings provided from this candidate to SourceCand.  We
1023   // are then able to map the GVN in this candidate to the same canonical number
1024   // given to the corresponding GVN in SourceCand.
1025   for (std::pair<unsigned, DenseSet<unsigned>> &GVNMapping : ToSourceMapping) {
1026     unsigned SourceGVN = GVNMapping.first;
1027 
1028     assert(GVNMapping.second.size() != 0 && "Possible GVNs is 0!");
1029 
1030     unsigned ResultGVN;
1031     // We need special handling if we have more than one potential value.  This
1032     // means that there are at least two GVNs that could correspond to this GVN.
1033     // This could lead to potential swapping later on, so we make a decision
1034     // here to ensure a one-to-one mapping.
1035     if (GVNMapping.second.size() > 1) {
1036       bool Found = false;
1037       for (unsigned Val : GVNMapping.second) {
1038         // We make sure the target value number hasn't already been reserved.
1039         if (UsedGVNs.contains(Val))
1040           continue;
1041 
1042         // We make sure that the opposite mapping is still consistent.
1043         DenseMap<unsigned, DenseSet<unsigned>>::iterator It =
1044             FromSourceMapping.find(Val);
1045 
1046         if (!It->second.contains(SourceGVN))
1047           continue;
1048 
1049         // We pick the first item that satisfies these conditions.
1050         Found = true;
1051         ResultGVN = Val;
1052         break;
1053       }
1054 
1055       assert(Found && "Could not find matching value for source GVN");
1056       (void)Found;
1057 
1058     } else
1059       ResultGVN = *GVNMapping.second.begin();
1060 
1061     // Whatever GVN is found, we mark it as used.
1062     UsedGVNs.insert(ResultGVN);
1063 
1064     unsigned CanonNum = *SourceCand.getCanonicalNum(ResultGVN);
1065     CanonNumToNumber.insert(std::make_pair(CanonNum, SourceGVN));
1066     NumberToCanonNum.insert(std::make_pair(SourceGVN, CanonNum));
1067   }
1068 
1069   DenseSet<BasicBlock *> BBSet;
1070   getBasicBlocks(BBSet);
1071   // Find canonical numbers for the BasicBlocks in the current candidate.
1072   // This is done by finding the corresponding value for the first instruction
1073   // in the block in the current candidate, finding the matching value in the
1074   // source candidate.  Then by finding the parent of this value, use the
1075   // canonical number of the block in the source candidate for the canonical
1076   // number in the current candidate.
1077   for (BasicBlock *BB : BBSet) {
1078     unsigned BBGVNForCurrCand = ValueToNumber.find(BB)->second;
1079 
1080     // We can skip the BasicBlock if the canonical numbering has already been
1081     // found in a separate instruction.
1082     if (NumberToCanonNum.contains(BBGVNForCurrCand))
1083       continue;
1084 
1085     // If the basic block is the starting block, then the shared instruction may
1086     // not be the first instruction in the block, it will be the first
1087     // instruction in the similarity region.
1088     Value *FirstOutlineInst = BB == getStartBB()
1089                                   ? frontInstruction()
1090                                   : &*BB->instructionsWithoutDebug().begin();
1091 
1092     unsigned FirstInstGVN = *getGVN(FirstOutlineInst);
1093     unsigned FirstInstCanonNum = *getCanonicalNum(FirstInstGVN);
1094     unsigned SourceGVN = *SourceCand.fromCanonicalNum(FirstInstCanonNum);
1095     Value *SourceV = *SourceCand.fromGVN(SourceGVN);
1096     BasicBlock *SourceBB = cast<Instruction>(SourceV)->getParent();
1097     unsigned SourceBBGVN = *SourceCand.getGVN(SourceBB);
1098     unsigned SourceCanonBBGVN = *SourceCand.getCanonicalNum(SourceBBGVN);
1099     CanonNumToNumber.insert(std::make_pair(SourceCanonBBGVN, BBGVNForCurrCand));
1100     NumberToCanonNum.insert(std::make_pair(BBGVNForCurrCand, SourceCanonBBGVN));
1101   }
1102 }
1103 
1104 void IRSimilarityCandidate::createCanonicalRelationFrom(
1105     IRSimilarityCandidate &SourceCand, IRSimilarityCandidate &SourceCandLarge,
1106     IRSimilarityCandidate &TargetCandLarge) {
1107   assert(!SourceCand.CanonNumToNumber.empty() &&
1108          "Canonical Relationship is non-empty");
1109   assert(!SourceCand.NumberToCanonNum.empty() &&
1110          "Canonical Relationship is non-empty");
1111 
1112   assert(!SourceCandLarge.CanonNumToNumber.empty() &&
1113          "Canonical Relationship is non-empty");
1114   assert(!SourceCandLarge.NumberToCanonNum.empty() &&
1115          "Canonical Relationship is non-empty");
1116 
1117   assert(!TargetCandLarge.CanonNumToNumber.empty() &&
1118          "Canonical Relationship is non-empty");
1119   assert(!TargetCandLarge.NumberToCanonNum.empty() &&
1120          "Canonical Relationship is non-empty");
1121 
1122   assert(CanonNumToNumber.empty() && "Canonical Relationship is non-empty");
1123   assert(NumberToCanonNum.empty() && "Canonical Relationship is non-empty");
1124 
1125   // We're going to use the larger candidates as a "bridge" to create the
1126   // canonical number for the target candidate since we have idetified two
1127   // candidates as subsequences of larger sequences, and therefore must be
1128   // structurally similar.
1129   for (std::pair<Value *, unsigned> &ValueNumPair : ValueToNumber) {
1130     Value *CurrVal = ValueNumPair.first;
1131     unsigned TargetCandGVN = ValueNumPair.second;
1132 
1133     // Find the numbering in the large candidate that surrounds the
1134     // current candidate.
1135     std::optional<unsigned> OLargeTargetGVN = TargetCandLarge.getGVN(CurrVal);
1136     assert(OLargeTargetGVN.has_value() && "GVN not found for Value");
1137 
1138     // Get the canonical numbering in the large target candidate.
1139     std::optional<unsigned> OTargetCandCanon =
1140         TargetCandLarge.getCanonicalNum(OLargeTargetGVN.value());
1141     assert(OTargetCandCanon.has_value() &&
1142            "Canononical Number not found for GVN");
1143 
1144     // Get the GVN in the large source candidate from the canonical numbering.
1145     std::optional<unsigned> OLargeSourceGVN =
1146         SourceCandLarge.fromCanonicalNum(OTargetCandCanon.value());
1147     assert(OLargeSourceGVN.has_value() &&
1148            "GVN Number not found for Canonical Number");
1149 
1150     // Get the Value from the GVN in the large source candidate.
1151     std::optional<Value *> OLargeSourceV =
1152         SourceCandLarge.fromGVN(OLargeSourceGVN.value());
1153     assert(OLargeSourceV.has_value() && "Value not found for GVN");
1154 
1155     // Get the GVN number for the Value in the source candidate.
1156     std::optional<unsigned> OSourceGVN =
1157         SourceCand.getGVN(OLargeSourceV.value());
1158     assert(OSourceGVN.has_value() && "GVN Number not found for Value");
1159 
1160     // Get the canonical numbering from the GVN/
1161     std::optional<unsigned> OSourceCanon =
1162         SourceCand.getCanonicalNum(OSourceGVN.value());
1163     assert(OSourceCanon.has_value() && "Canon Number not found for GVN");
1164 
1165     // Insert the canonical numbering and GVN pair into their respective
1166     // mappings.
1167     CanonNumToNumber.insert(
1168         std::make_pair(OSourceCanon.value(), TargetCandGVN));
1169     NumberToCanonNum.insert(
1170         std::make_pair(TargetCandGVN, OSourceCanon.value()));
1171   }
1172 }
1173 
1174 void IRSimilarityCandidate::createCanonicalMappingFor(
1175     IRSimilarityCandidate &CurrCand) {
1176   assert(CurrCand.CanonNumToNumber.size() == 0 &&
1177          "Canonical Relationship is non-empty");
1178   assert(CurrCand.NumberToCanonNum.size() == 0 &&
1179          "Canonical Relationship is non-empty");
1180 
1181   unsigned CanonNum = 0;
1182   // Iterate over the value numbers found, the order does not matter in this
1183   // case.
1184   for (std::pair<unsigned, Value *> &NumToVal : CurrCand.NumberToValue) {
1185     CurrCand.NumberToCanonNum.insert(std::make_pair(NumToVal.first, CanonNum));
1186     CurrCand.CanonNumToNumber.insert(std::make_pair(CanonNum, NumToVal.first));
1187     CanonNum++;
1188   }
1189 }
1190 
1191 /// Look for larger IRSimilarityCandidates From the previously matched
1192 /// IRSimilarityCandidates that fully contain \p CandA or \p CandB.  If there is
1193 /// an overlap, return a pair of structurally similar, larger
1194 /// IRSimilarityCandidates.
1195 ///
1196 /// \param [in] CandA - The first candidate we are trying to determine the
1197 /// structure of.
1198 /// \param [in] CandB - The second candidate we are trying to determine the
1199 /// structure of.
1200 /// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in
1201 /// a circuit to the IRSimilarityCandidates that include this instruction.
1202 /// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a
1203 /// number representing the structural group assigned to it.
1204 static std::optional<
1205     std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1206 CheckLargerCands(
1207     IRSimilarityCandidate &CandA, IRSimilarityCandidate &CandB,
1208     DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> &IndexToIncludedCand,
1209     DenseMap<IRSimilarityCandidate *, unsigned> &CandToGroup) {
1210   DenseMap<unsigned, IRSimilarityCandidate *> IncludedGroupAndCandA;
1211   DenseMap<unsigned, IRSimilarityCandidate *> IncludedGroupAndCandB;
1212   DenseSet<unsigned> IncludedGroupsA;
1213   DenseSet<unsigned> IncludedGroupsB;
1214 
1215   // Find the overall similarity group numbers that fully contain the candidate,
1216   // and record the larger candidate for each group.
1217   auto IdxToCandidateIt = IndexToIncludedCand.find(CandA.getStartIdx());
1218   std::optional<std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1219       Result;
1220 
1221   unsigned CandAStart = CandA.getStartIdx();
1222   unsigned CandAEnd = CandA.getEndIdx();
1223   unsigned CandBStart = CandB.getStartIdx();
1224   unsigned CandBEnd = CandB.getEndIdx();
1225   if (IdxToCandidateIt == IndexToIncludedCand.end())
1226     return Result;
1227   for (IRSimilarityCandidate *MatchedCand : IdxToCandidateIt->second) {
1228     if (MatchedCand->getStartIdx() > CandAStart ||
1229         (MatchedCand->getEndIdx() < CandAEnd))
1230       continue;
1231     unsigned GroupNum = CandToGroup.find(MatchedCand)->second;
1232     IncludedGroupAndCandA.insert(std::make_pair(GroupNum, MatchedCand));
1233     IncludedGroupsA.insert(GroupNum);
1234   }
1235 
1236   // Find the overall similarity group numbers that fully contain the next
1237   // candidate, and record the larger candidate for each group.
1238   IdxToCandidateIt = IndexToIncludedCand.find(CandBStart);
1239   if (IdxToCandidateIt == IndexToIncludedCand.end())
1240     return Result;
1241   for (IRSimilarityCandidate *MatchedCand : IdxToCandidateIt->second) {
1242     if (MatchedCand->getStartIdx() > CandBStart ||
1243         MatchedCand->getEndIdx() < CandBEnd)
1244       continue;
1245     unsigned GroupNum = CandToGroup.find(MatchedCand)->second;
1246     IncludedGroupAndCandB.insert(std::make_pair(GroupNum, MatchedCand));
1247     IncludedGroupsB.insert(GroupNum);
1248   }
1249 
1250   // Find the intersection between the two groups, these are the groups where
1251   // the larger candidates exist.
1252   set_intersect(IncludedGroupsA, IncludedGroupsB);
1253 
1254   // If there is no intersection between the sets, then we cannot determine
1255   // whether or not there is a match.
1256   if (IncludedGroupsA.empty())
1257     return Result;
1258 
1259   // Create a pair that contains the larger candidates.
1260   auto ItA = IncludedGroupAndCandA.find(*IncludedGroupsA.begin());
1261   auto ItB = IncludedGroupAndCandB.find(*IncludedGroupsA.begin());
1262   Result = std::make_pair(ItA->second, ItB->second);
1263   return Result;
1264 }
1265 
1266 /// From the list of IRSimilarityCandidates, perform a comparison between each
1267 /// IRSimilarityCandidate to determine if there are overlapping
1268 /// IRInstructionData, or if they do not have the same structure.
1269 ///
1270 /// \param [in] CandsForRepSubstring - The vector containing the
1271 /// IRSimilarityCandidates.
1272 /// \param [out] StructuralGroups - the mapping of unsigned integers to vector
1273 /// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
1274 /// vector are structurally similar to one another.
1275 /// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in
1276 /// a circuit to the IRSimilarityCandidates that include this instruction.
1277 /// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a
1278 /// number representing the structural group assigned to it.
1279 static void findCandidateStructures(
1280     std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
1281     DenseMap<unsigned, SimilarityGroup> &StructuralGroups,
1282     DenseMap<unsigned,  DenseSet<IRSimilarityCandidate *>> &IndexToIncludedCand,
1283     DenseMap<IRSimilarityCandidate *, unsigned> &CandToOverallGroup
1284     ) {
1285   std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
1286       InnerCandEndIt;
1287 
1288   // IRSimilarityCandidates each have a structure for operand use.  It is
1289   // possible that two instances of the same subsequences have different
1290   // structure. Each type of structure found is assigned a number.  This
1291   // DenseMap maps an IRSimilarityCandidate to which type of similarity
1292   // discovered it fits within.
1293   DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
1294 
1295   // Find the compatibility from each candidate to the others to determine
1296   // which candidates overlap and which have the same structure by mapping
1297   // each structure to a different group.
1298   bool SameStructure;
1299   bool Inserted;
1300   unsigned CurrentGroupNum = 0;
1301   unsigned OuterGroupNum;
1302   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
1303   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
1304   DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
1305 
1306   // Iterate over the candidates to determine its structural and overlapping
1307   // compatibility with other instructions
1308   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
1309   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
1310   for (CandIt = CandsForRepSubstring.begin(),
1311       CandEndIt = CandsForRepSubstring.end();
1312        CandIt != CandEndIt; CandIt++) {
1313 
1314     // Determine if it has an assigned structural group already.
1315     CandToGroupIt = CandToGroup.find(&*CandIt);
1316     if (CandToGroupIt == CandToGroup.end()) {
1317       // If not, we assign it one, and add it to our mapping.
1318       std::tie(CandToGroupIt, Inserted) =
1319           CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++));
1320     }
1321 
1322     // Get the structural group number from the iterator.
1323     OuterGroupNum = CandToGroupIt->second;
1324 
1325     // Check if we already have a list of IRSimilarityCandidates for the current
1326     // structural group.  Create one if one does not exist.
1327     CurrentGroupPair = StructuralGroups.find(OuterGroupNum);
1328     if (CurrentGroupPair == StructuralGroups.end()) {
1329       IRSimilarityCandidate::createCanonicalMappingFor(*CandIt);
1330       std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert(
1331           std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt})));
1332     }
1333 
1334     // Iterate over the IRSimilarityCandidates following the current
1335     // IRSimilarityCandidate in the list to determine whether the two
1336     // IRSimilarityCandidates are compatible.  This is so we do not repeat pairs
1337     // of IRSimilarityCandidates.
1338     for (InnerCandIt = std::next(CandIt),
1339         InnerCandEndIt = CandsForRepSubstring.end();
1340          InnerCandIt != InnerCandEndIt; InnerCandIt++) {
1341 
1342       // We check if the inner item has a group already, if it does, we skip it.
1343       CandToGroupItInner = CandToGroup.find(&*InnerCandIt);
1344       if (CandToGroupItInner != CandToGroup.end())
1345         continue;
1346 
1347       // Check if we have found structural similarity between two candidates
1348       // that fully contains the first and second candidates.
1349       std::optional<std::pair<IRSimilarityCandidate *, IRSimilarityCandidate *>>
1350           LargerPair = CheckLargerCands(
1351               *CandIt, *InnerCandIt, IndexToIncludedCand, CandToOverallGroup);
1352 
1353       // If a pair was found, it means that we can assume that these smaller
1354       // substrings are also structurally similar.  Use the larger candidates to
1355       // determine the canonical mapping between the two sections.
1356       if (LargerPair.has_value()) {
1357         SameStructure = true;
1358         InnerCandIt->createCanonicalRelationFrom(
1359             *CandIt, *LargerPair.value().first, *LargerPair.value().second);
1360         CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum));
1361         CurrentGroupPair->second.push_back(*InnerCandIt);
1362         continue;
1363       }
1364 
1365       // Otherwise we determine if they have the same structure and add it to
1366       // vector if they match.
1367       ValueNumberMappingA.clear();
1368       ValueNumberMappingB.clear();
1369       SameStructure = IRSimilarityCandidate::compareStructure(
1370           *CandIt, *InnerCandIt, ValueNumberMappingA, ValueNumberMappingB);
1371       if (!SameStructure)
1372         continue;
1373 
1374       InnerCandIt->createCanonicalRelationFrom(*CandIt, ValueNumberMappingA,
1375                                                ValueNumberMappingB);
1376       CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum));
1377       CurrentGroupPair->second.push_back(*InnerCandIt);
1378     }
1379   }
1380 }
1381 
1382 void IRSimilarityIdentifier::findCandidates(
1383     std::vector<IRInstructionData *> &InstrList,
1384     std::vector<unsigned> &IntegerMapping) {
1385   SuffixTree ST(IntegerMapping);
1386 
1387   std::vector<IRSimilarityCandidate> CandsForRepSubstring;
1388   std::vector<SimilarityGroup> NewCandidateGroups;
1389 
1390   DenseMap<unsigned, SimilarityGroup> StructuralGroups;
1391   DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>> IndexToIncludedCand;
1392   DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
1393 
1394   // Iterate over the subsequences found by the Suffix Tree to create
1395   // IRSimilarityCandidates for each repeated subsequence and determine which
1396   // instances are structurally similar to one another.
1397 
1398   // Sort the suffix tree from longest substring to shortest.
1399   std::vector<SuffixTree::RepeatedSubstring> RSes;
1400   for (SuffixTree::RepeatedSubstring &RS : ST)
1401     RSes.push_back(RS);
1402 
1403   llvm::stable_sort(RSes, [](const SuffixTree::RepeatedSubstring &LHS,
1404                              const SuffixTree::RepeatedSubstring &RHS) {
1405     return LHS.Length > RHS.Length;
1406   });
1407   for (SuffixTree::RepeatedSubstring &RS : RSes) {
1408     createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, RS,
1409                                    CandsForRepSubstring);
1410 
1411     if (CandsForRepSubstring.size() < 2)
1412       continue;
1413 
1414     findCandidateStructures(CandsForRepSubstring, StructuralGroups,
1415                             IndexToIncludedCand, CandToGroup);
1416     for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups) {
1417       // We only add the group if it contains more than one
1418       // IRSimilarityCandidate.  If there is only one, that means there is no
1419       // other repeated subsequence with the same structure.
1420       if (Group.second.size() > 1) {
1421         SimilarityCandidates->push_back(Group.second);
1422         // Iterate over each candidate in the group, and add an entry for each
1423         // instruction included with a mapping to a set of
1424         // IRSimilarityCandidates that include that instruction.
1425         for (IRSimilarityCandidate &IRCand : SimilarityCandidates->back()) {
1426           for (unsigned Idx = IRCand.getStartIdx(), Edx = IRCand.getEndIdx();
1427                Idx <= Edx; ++Idx) {
1428             DenseMap<unsigned, DenseSet<IRSimilarityCandidate *>>::iterator
1429                 IdIt;
1430             IdIt = IndexToIncludedCand.find(Idx);
1431             bool Inserted = false;
1432             if (IdIt == IndexToIncludedCand.end())
1433               std::tie(IdIt, Inserted) = IndexToIncludedCand.insert(
1434                   std::make_pair(Idx, DenseSet<IRSimilarityCandidate *>()));
1435             IdIt->second.insert(&IRCand);
1436           }
1437           // Add mapping of candidate to the overall similarity group number.
1438           CandToGroup.insert(
1439               std::make_pair(&IRCand, SimilarityCandidates->size() - 1));
1440         }
1441       }
1442     }
1443 
1444     CandsForRepSubstring.clear();
1445     StructuralGroups.clear();
1446     NewCandidateGroups.clear();
1447   }
1448 }
1449 
1450 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
1451     ArrayRef<std::unique_ptr<Module>> Modules) {
1452   resetSimilarityCandidates();
1453 
1454   std::vector<IRInstructionData *> InstrList;
1455   std::vector<unsigned> IntegerMapping;
1456   Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1457   Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1458   Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1459   Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1460   Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1461 
1462   populateMapper(Modules, InstrList, IntegerMapping);
1463   findCandidates(InstrList, IntegerMapping);
1464 
1465   return *SimilarityCandidates;
1466 }
1467 
1468 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
1469   resetSimilarityCandidates();
1470   Mapper.InstClassifier.EnableBranches = this->EnableBranches;
1471   Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
1472   Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
1473   Mapper.InstClassifier.EnableIntrinsics = EnableIntrinsics;
1474   Mapper.InstClassifier.EnableMustTailCalls = EnableMustTailCalls;
1475 
1476   std::vector<IRInstructionData *> InstrList;
1477   std::vector<unsigned> IntegerMapping;
1478 
1479   populateMapper(M, InstrList, IntegerMapping);
1480   findCandidates(InstrList, IntegerMapping);
1481 
1482   return *SimilarityCandidates;
1483 }
1484 
1485 INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier",
1486                 "ir-similarity-identifier", false, true)
1487 
1488 IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
1489     : ModulePass(ID) {
1490   initializeIRSimilarityIdentifierWrapperPassPass(
1491       *PassRegistry::getPassRegistry());
1492 }
1493 
1494 bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
1495   IRSI.reset(new IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1496                                         MatchCallsByName, !DisableIntrinsics,
1497                                         false));
1498   return false;
1499 }
1500 
1501 bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) {
1502   IRSI.reset();
1503   return false;
1504 }
1505 
1506 bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
1507   IRSI->findSimilarity(M);
1508   return false;
1509 }
1510 
1511 AnalysisKey IRSimilarityAnalysis::Key;
1512 IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
1513                                                  ModuleAnalysisManager &) {
1514   auto IRSI = IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
1515                                      MatchCallsByName, !DisableIntrinsics,
1516                                      false);
1517   IRSI.findSimilarity(M);
1518   return IRSI;
1519 }
1520 
1521 PreservedAnalyses
1522 IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) {
1523   IRSimilarityIdentifier &IRSI = AM.getResult<IRSimilarityAnalysis>(M);
1524   std::optional<SimilarityGroupList> &SimilarityCandidatesOpt =
1525       IRSI.getSimilarity();
1526 
1527   for (std::vector<IRSimilarityCandidate> &CandVec : *SimilarityCandidatesOpt) {
1528     OS << CandVec.size() << " candidates of length "
1529        << CandVec.begin()->getLength() << ".  Found in: \n";
1530     for (IRSimilarityCandidate &Cand : CandVec) {
1531       OS << "  Function: " << Cand.front()->Inst->getFunction()->getName().str()
1532          << ", Basic Block: ";
1533       if (Cand.front()->Inst->getParent()->getName().str() == "")
1534         OS << "(unnamed)";
1535       else
1536         OS << Cand.front()->Inst->getParent()->getName().str();
1537       OS << "\n    Start Instruction: ";
1538       Cand.frontInstruction()->print(OS);
1539       OS << "\n      End Instruction: ";
1540       Cand.backInstruction()->print(OS);
1541       OS << "\n";
1542     }
1543   }
1544 
1545   return PreservedAnalyses::all();
1546 }
1547 
1548 char IRSimilarityIdentifierWrapperPass::ID = 0;
1549