1 //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // \file 10 // Interface file for the IRSimilarityIdentifier for identifying similarities in 11 // IR including the IRInstructionMapper, which maps an Instruction to unsigned 12 // integers. 13 // 14 // Two sequences of instructions are called "similar" if they perform the same 15 // series of operations for all inputs. 16 // 17 // \code 18 // %1 = add i32 %a, 10 19 // %2 = add i32 %a, %1 20 // %3 = icmp slt icmp %1, %2 21 // \endcode 22 // 23 // and 24 // 25 // \code 26 // %1 = add i32 11, %a 27 // %2 = sub i32 %a, %1 28 // %3 = icmp sgt icmp %2, %1 29 // \endcode 30 // 31 // ultimately have the same result, even if the inputs, and structure are 32 // slightly different. 33 // 34 // For instructions, we do not worry about operands that do not have fixed 35 // semantic meaning to the program. We consider the opcode that the instruction 36 // has, the types, parameters, and extra information such as the function name, 37 // or comparison predicate. These are used to create a hash to map instructions 38 // to integers to be used in similarity matching in sequences of instructions 39 // 40 // Terminology: 41 // An IRSimilarityCandidate is a region of IRInstructionData (wrapped 42 // Instructions), usually used to denote a region of similarity has been found. 43 // 44 // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally 45 // similar to one another. 46 // 47 //===----------------------------------------------------------------------===// 48 49 #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H 50 #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H 51 52 #include "llvm/IR/InstVisitor.h" 53 #include "llvm/IR/Instructions.h" 54 #include "llvm/IR/PassManager.h" 55 #include "llvm/Pass.h" 56 #include "llvm/Support/Allocator.h" 57 #include <optional> 58 59 namespace llvm { 60 class Module; 61 62 namespace IRSimilarity { 63 64 struct IRInstructionDataList; 65 66 /// This represents what is and is not supported when finding similarity in 67 /// Instructions. 68 /// 69 /// Legal Instructions are considered when looking at similarity between 70 /// Instructions. 71 /// 72 /// Illegal Instructions cannot be considered when looking for similarity 73 /// between Instructions. They act as boundaries between similarity regions. 74 /// 75 /// Invisible Instructions are skipped over during analysis. 76 // TODO: Shared with MachineOutliner 77 enum InstrType { Legal, Illegal, Invisible }; 78 79 /// This provides the utilities for hashing an Instruction to an unsigned 80 /// integer. Two IRInstructionDatas produce the same hash value when their 81 /// underlying Instructions perform the same operation (even if they don't have 82 /// the same input operands.) 83 /// As a more concrete example, consider the following: 84 /// 85 /// \code 86 /// %add1 = add i32 %a, %b 87 /// %add2 = add i32 %c, %d 88 /// %add3 = add i64 %e, %f 89 /// \endcode 90 /// 91 // Then the IRInstructionData wrappers for these Instructions may be hashed like 92 /// so: 93 /// 94 /// \code 95 /// ; These two adds have the same types and operand types, so they hash to the 96 /// ; same number. 97 /// %add1 = add i32 %a, %b ; Hash: 1 98 /// %add2 = add i32 %c, %d ; Hash: 1 99 /// ; This add produces an i64. This differentiates it from %add1 and %add2. So, 100 /// ; it hashes to a different number. 101 /// %add3 = add i64 %e, %f; Hash: 2 102 /// \endcode 103 /// 104 /// 105 /// This hashing scheme will be used to represent the program as a very long 106 /// string. This string can then be placed in a data structure which can be used 107 /// for similarity queries. 108 /// 109 /// TODO: Handle types of Instructions which can be equal even with different 110 /// operands. (E.g. comparisons with swapped predicates.) 111 /// TODO: Handle CallInsts, which are only checked for function type 112 /// by \ref isSameOperationAs. 113 /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the 114 /// exact same, and some do not. 115 struct IRInstructionData 116 : ilist_node<IRInstructionData, ilist_sentinel_tracking<true>> { 117 118 /// The source Instruction that is being wrapped. 119 Instruction *Inst = nullptr; 120 /// The values of the operands in the Instruction. 121 SmallVector<Value *, 4> OperVals; 122 /// The legality of the wrapped instruction. This is informed by InstrType, 123 /// and is used when checking when two instructions are considered similar. 124 /// If either instruction is not legal, the instructions are automatically not 125 /// considered similar. 126 bool Legal = false; 127 128 /// This is only relevant if we are wrapping a CmpInst where we needed to 129 /// change the predicate of a compare instruction from a greater than form 130 /// to a less than form. It is std::nullopt otherwise. 131 std::optional<CmpInst::Predicate> RevisedPredicate; 132 133 /// This is only relevant if we are wrapping a CallInst. If we are requiring 134 /// that the function calls have matching names as well as types, and the 135 /// call is not an indirect call, this will hold the name of the function. If 136 /// it is an indirect string, it will be the empty string. However, if this 137 /// requirement is not in place it will be the empty string regardless of the 138 /// function call type. The value held here is used to create the hash of the 139 /// instruction, and check to make sure two instructions are close to one 140 /// another. 141 std::optional<std::string> CalleeName; 142 143 /// This structure holds the distances of how far "ahead of" or "behind" the 144 /// target blocks of a branch, or the incoming blocks of a phi nodes are. 145 /// If the value is negative, it means that the block was registered before 146 /// the block of this instruction in terms of blocks in the function. 147 /// Code Example: 148 /// \code 149 /// block_1: 150 /// br i1 %0, label %block_2, label %block_3 151 /// block_2: 152 /// br i1 %1, label %block_1, label %block_2 153 /// block_3: 154 /// br i1 %2, label %block_2, label %block_1 155 /// ; Replacing the labels with relative values, this becomes: 156 /// block_1: 157 /// br i1 %0, distance 1, distance 2 158 /// block_2: 159 /// br i1 %1, distance -1, distance 0 160 /// block_3: 161 /// br i1 %2, distance -1, distance -2 162 /// \endcode 163 /// Taking block_2 as our example, block_1 is "behind" block_2, and block_2 is 164 /// "ahead" of block_2. 165 SmallVector<int, 4> RelativeBlockLocations; 166 167 /// Gather the information that is difficult to gather for an Instruction, or 168 /// is changed. i.e. the operands of an Instruction and the Types of those 169 /// operands. This extra information allows for similarity matching to make 170 /// assertions that allow for more flexibility when checking for whether an 171 /// Instruction performs the same operation. 172 IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL); 173 IRInstructionData(IRInstructionDataList &IDL); 174 175 /// Fills data stuctures for IRInstructionData when it is constructed from a 176 // reference or a pointer. 177 void initializeInstruction(); 178 179 /// Get the predicate that the compare instruction is using for hashing the 180 /// instruction. the IRInstructionData must be wrapping a CmpInst. 181 CmpInst::Predicate getPredicate() const; 182 183 /// Get the callee name that the call instruction is using for hashing the 184 /// instruction. The IRInstructionData must be wrapping a CallInst. 185 StringRef getCalleeName() const; 186 187 /// A function that swaps the predicates to their less than form if they are 188 /// in a greater than form. Otherwise, the predicate is unchanged. 189 /// 190 /// \param CI - The comparison operation to find a consistent preidcate for. 191 /// \return the consistent comparison predicate. 192 static CmpInst::Predicate predicateForConsistency(CmpInst *CI); 193 194 /// For an IRInstructionData containing a branch, finds the 195 /// relative distances from the source basic block to the target by taking 196 /// the difference of the number assigned to the current basic block and the 197 /// target basic block of the branch. 198 /// 199 /// \param BasicBlockToInteger - The mapping of basic blocks to their location 200 /// in the module. 201 void 202 setBranchSuccessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger); 203 204 /// For an IRInstructionData containing a CallInst, set the function name 205 /// appropriately. This will be an empty string if it is an indirect call, 206 /// or we are not matching by name of the called function. It will be the 207 /// name of the function if \p MatchByName is true and it is not an indirect 208 /// call. We may decide not to match by name in order to expand the 209 /// size of the regions we can match. If a function name has the same type 210 /// signature, but the different name, the region of code is still almost the 211 /// same. Since function names can be treated as constants, the name itself 212 /// could be extrapolated away. However, matching by name provides a 213 /// specificity and more "identical" code than not matching by name. 214 /// 215 /// \param MatchByName - A flag to mark whether we are using the called 216 /// function name as a differentiating parameter. 217 void setCalleeName(bool MatchByName = true); 218 219 /// For an IRInstructionData containing a PHINode, finds the 220 /// relative distances from the incoming basic block to the current block by 221 /// taking the difference of the number assigned to the current basic block 222 /// and the incoming basic block of the branch. 223 /// 224 /// \param BasicBlockToInteger - The mapping of basic blocks to their location 225 /// in the module. 226 void 227 setPHIPredecessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger); 228 229 /// Get the BasicBlock based operands for PHINodes and BranchInsts. 230 /// 231 /// \returns A list of relevant BasicBlocks. 232 ArrayRef<Value *> getBlockOperVals(); 233 234 /// Hashes \p Value based on its opcode, types, and operand types. 235 /// Two IRInstructionData instances produce the same hash when they perform 236 /// the same operation. 237 /// 238 /// As a simple example, consider the following instructions. 239 /// 240 /// \code 241 /// %add1 = add i32 %x1, %y1 242 /// %add2 = add i32 %x2, %y2 243 /// 244 /// %sub = sub i32 %x1, %y1 245 /// 246 /// %add_i64 = add i64 %x2, %y2 247 /// \endcode 248 /// 249 /// Because the first two adds operate the same types, and are performing the 250 /// same action, they will be hashed to the same value. 251 /// 252 /// However, the subtraction instruction is not the same as an addition, and 253 /// will be hashed to a different value. 254 /// 255 /// Finally, the last add has a different type compared to the first two add 256 /// instructions, so it will also be hashed to a different value that any of 257 /// the previous instructions. 258 /// 259 /// \param [in] ID - The IRInstructionData instance to be hashed. 260 /// \returns A hash_value of the IRInstructionData. 261 friend hash_code hash_value(const IRInstructionData &ID) { 262 SmallVector<Type *, 4> OperTypes; 263 for (Value *V : ID.OperVals) 264 OperTypes.push_back(V->getType()); 265 266 if (isa<CmpInst>(ID.Inst)) 267 return llvm::hash_combine( 268 llvm::hash_value(ID.Inst->getOpcode()), 269 llvm::hash_value(ID.Inst->getType()), 270 llvm::hash_value(ID.getPredicate()), 271 llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 272 273 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(ID.Inst)) { 274 // To hash intrinsics, we use the opcode, and types like the other 275 // instructions, but also, the Intrinsic ID, and the Name of the 276 // intrinsic. 277 Intrinsic::ID IntrinsicID = II->getIntrinsicID(); 278 return llvm::hash_combine( 279 llvm::hash_value(ID.Inst->getOpcode()), 280 llvm::hash_value(ID.Inst->getType()), llvm::hash_value(IntrinsicID), 281 llvm::hash_value(*ID.CalleeName), 282 llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 283 } 284 285 if (isa<CallInst>(ID.Inst)) { 286 std::string FunctionName = *ID.CalleeName; 287 return llvm::hash_combine( 288 llvm::hash_value(ID.Inst->getOpcode()), 289 llvm::hash_value(ID.Inst->getType()), 290 llvm::hash_value(ID.Inst->getType()), llvm::hash_value(FunctionName), 291 llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 292 } 293 294 return llvm::hash_combine( 295 llvm::hash_value(ID.Inst->getOpcode()), 296 llvm::hash_value(ID.Inst->getType()), 297 llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); 298 } 299 300 IRInstructionDataList *IDL = nullptr; 301 }; 302 303 struct IRInstructionDataList 304 : simple_ilist<IRInstructionData, ilist_sentinel_tracking<true>> {}; 305 306 /// Compare one IRInstructionData class to another IRInstructionData class for 307 /// whether they are performing a the same operation, and can mapped to the 308 /// same value. For regular instructions if the hash value is the same, then 309 /// they will also be close. 310 /// 311 /// \param A - The first IRInstructionData class to compare 312 /// \param B - The second IRInstructionData class to compare 313 /// \returns true if \p A and \p B are similar enough to be mapped to the same 314 /// value. 315 bool isClose(const IRInstructionData &A, const IRInstructionData &B); 316 317 struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> { 318 static inline IRInstructionData *getEmptyKey() { return nullptr; } 319 static inline IRInstructionData *getTombstoneKey() { 320 return reinterpret_cast<IRInstructionData *>(-1); 321 } 322 323 static unsigned getHashValue(const IRInstructionData *E) { 324 using llvm::hash_value; 325 assert(E && "IRInstructionData is a nullptr?"); 326 return hash_value(*E); 327 } 328 329 static bool isEqual(const IRInstructionData *LHS, 330 const IRInstructionData *RHS) { 331 if (RHS == getEmptyKey() || RHS == getTombstoneKey() || 332 LHS == getEmptyKey() || LHS == getTombstoneKey()) 333 return LHS == RHS; 334 335 assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?"); 336 return isClose(*LHS, *RHS); 337 } 338 }; 339 340 /// Helper struct for converting the Instructions in a Module into a vector of 341 /// unsigned integers. This vector of unsigned integers can be thought of as a 342 /// "numeric string". This numeric string can then be queried by, for example, 343 /// data structures that find repeated substrings. 344 /// 345 /// This hashing is done per BasicBlock in the module. To hash Instructions 346 /// based off of their operations, each Instruction is wrapped in an 347 /// IRInstructionData struct. The unsigned integer for an IRInstructionData 348 /// depends on: 349 /// - The hash provided by the IRInstructionData. 350 /// - Which member of InstrType the IRInstructionData is classified as. 351 // See InstrType for more details on the possible classifications, and how they 352 // manifest in the numeric string. 353 /// 354 /// The numeric string for an individual BasicBlock is terminated by an unique 355 /// unsigned integer. This prevents data structures which rely on repetition 356 /// from matching across BasicBlocks. (For example, the SuffixTree.) 357 /// As a concrete example, if we have the following two BasicBlocks: 358 /// \code 359 /// bb0: 360 /// %add1 = add i32 %a, %b 361 /// %add2 = add i32 %c, %d 362 /// %add3 = add i64 %e, %f 363 /// bb1: 364 /// %sub = sub i32 %c, %d 365 /// \endcode 366 /// We may hash the Instructions like this (via IRInstructionData): 367 /// \code 368 /// bb0: 369 /// %add1 = add i32 %a, %b ; Hash: 1 370 /// %add2 = add i32 %c, %d; Hash: 1 371 /// %add3 = add i64 %e, %f; Hash: 2 372 /// bb1: 373 /// %sub = sub i32 %c, %d; Hash: 3 374 /// %add4 = add i32 %c, %d ; Hash: 1 375 /// \endcode 376 /// And produce a "numeric string representation" like so: 377 /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 378 /// 379 /// TODO: This is very similar to the MachineOutliner, and should be 380 /// consolidated into the same interface. 381 struct IRInstructionMapper { 382 /// The starting illegal instruction number to map to. 383 /// 384 /// Set to -3 for compatibility with DenseMapInfo<unsigned>. 385 unsigned IllegalInstrNumber = static_cast<unsigned>(-3); 386 387 /// The next available integer to assign to a legal Instruction to. 388 unsigned LegalInstrNumber = 0; 389 390 /// Correspondence from IRInstructionData to unsigned integers. 391 DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits> 392 InstructionIntegerMap; 393 394 /// A mapping for a basic block in a module to its assigned number/location 395 /// in the module. 396 DenseMap<BasicBlock *, unsigned> BasicBlockToInteger; 397 398 /// Set if we added an illegal number in the previous step. 399 /// Since each illegal number is unique, we only need one of them between 400 /// each range of legal numbers. This lets us make sure we don't add more 401 /// than one illegal number per range. 402 bool AddedIllegalLastTime = false; 403 404 /// Marks whether we found a illegal instruction in the previous step. 405 bool CanCombineWithPrevInstr = false; 406 407 /// Marks whether we have found a set of instructions that is long enough 408 /// to be considered for similarity. 409 bool HaveLegalRange = false; 410 411 /// Marks whether we should use exact function names, as well as types to 412 /// find similarity between calls. 413 bool EnableMatchCallsByName = false; 414 415 /// This allocator pointer is in charge of holding on to the IRInstructionData 416 /// so it is not deallocated until whatever external tool is using it is done 417 /// with the information. 418 SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr; 419 420 /// This allocator pointer is in charge of creating the IRInstructionDataList 421 /// so it is not deallocated until whatever external tool is using it is done 422 /// with the information. 423 SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr; 424 425 /// Get an allocated IRInstructionData struct using the InstDataAllocator. 426 /// 427 /// \param I - The Instruction to wrap with IRInstructionData. 428 /// \param Legality - A boolean value that is true if the instruction is to 429 /// be considered for similarity, and false if not. 430 /// \param IDL - The InstructionDataList that the IRInstructionData is 431 /// inserted into. 432 /// \returns An allocated IRInstructionData struct. 433 IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality, 434 IRInstructionDataList &IDL); 435 436 /// Get an empty allocated IRInstructionData struct using the 437 /// InstDataAllocator. 438 /// 439 /// \param IDL - The InstructionDataList that the IRInstructionData is 440 /// inserted into. 441 /// \returns An allocated IRInstructionData struct. 442 IRInstructionData *allocateIRInstructionData(IRInstructionDataList &IDL); 443 444 /// Get an allocated IRInstructionDataList object using the IDLAllocator. 445 /// 446 /// \returns An allocated IRInstructionDataList object. 447 IRInstructionDataList *allocateIRInstructionDataList(); 448 449 IRInstructionDataList *IDL = nullptr; 450 451 /// Assigns values to all the basic blocks in function \p F starting from 452 /// integer \p BBNumber. 453 /// 454 /// \param F - The function containing the basic blocks to assign numbers to. 455 /// \param BBNumber - The number to start from. 456 void initializeForBBs(Function &F, unsigned &BBNumber) { 457 for (BasicBlock &BB : F) 458 BasicBlockToInteger.insert(std::make_pair(&BB, BBNumber++)); 459 } 460 461 /// Assigns values to all the basic blocks in Module \p M. 462 /// \param M - The module containing the basic blocks to assign numbers to. 463 void initializeForBBs(Module &M) { 464 unsigned BBNumber = 0; 465 for (Function &F : M) 466 initializeForBBs(F, BBNumber); 467 } 468 469 /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers 470 /// determined by \p InstrType. Two Instructions are mapped to the same value 471 /// if they are close as defined by the InstructionData class above. 472 /// 473 /// \param [in] BB - The BasicBlock to be mapped to integers. 474 /// \param [in,out] InstrList - Vector of IRInstructionData to append to. 475 /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to. 476 void convertToUnsignedVec(BasicBlock &BB, 477 std::vector<IRInstructionData *> &InstrList, 478 std::vector<unsigned> &IntegerMapping); 479 480 /// Maps an Instruction to a legal integer. 481 /// 482 /// \param [in] It - The Instruction to be mapped to an integer. 483 /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to 484 /// append to. 485 /// \param [in,out] InstrListForBB - Vector of InstructionData to append to. 486 /// \returns The integer \p It was mapped to. 487 unsigned mapToLegalUnsigned(BasicBlock::iterator &It, 488 std::vector<unsigned> &IntegerMappingForBB, 489 std::vector<IRInstructionData *> &InstrListForBB); 490 491 /// Maps an Instruction to an illegal integer. 492 /// 493 /// \param [in] It - The \p Instruction to be mapped to an integer. 494 /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to 495 /// append to. 496 /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to. 497 /// \param End - true if creating a dummy IRInstructionData at the end of a 498 /// basic block. 499 /// \returns The integer \p It was mapped to. 500 unsigned mapToIllegalUnsigned( 501 BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB, 502 std::vector<IRInstructionData *> &InstrListForBB, bool End = false); 503 504 IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA, 505 SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA) 506 : InstDataAllocator(IDA), IDLAllocator(IDLA) { 507 // Make sure that the implementation of DenseMapInfo<unsigned> hasn't 508 // changed. 509 assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && 510 "DenseMapInfo<unsigned>'s empty key isn't -1!"); 511 assert(DenseMapInfo<unsigned>::getTombstoneKey() == 512 static_cast<unsigned>(-2) && 513 "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); 514 515 IDL = new (IDLAllocator->Allocate()) 516 IRInstructionDataList(); 517 } 518 519 /// Custom InstVisitor to classify different instructions for whether it can 520 /// be analyzed for similarity. 521 struct InstructionClassification 522 : public InstVisitor<InstructionClassification, InstrType> { 523 InstructionClassification() = default; 524 525 // TODO: Determine a scheme to resolve when the label is similar enough. 526 InstrType visitBranchInst(BranchInst &BI) { 527 if (EnableBranches) 528 return Legal; 529 return Illegal; 530 } 531 InstrType visitPHINode(PHINode &PN) { 532 if (EnableBranches) 533 return Legal; 534 return Illegal; 535 } 536 // TODO: Handle allocas. 537 InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; } 538 // We exclude variable argument instructions since variable arguments 539 // requires extra checking of the argument list. 540 InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; } 541 // We exclude all exception handling cases since they are so context 542 // dependent. 543 InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } 544 InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } 545 // DebugInfo should be included in the regions, but should not be 546 // analyzed for similarity as it has no bearing on the outcome of the 547 // program. 548 InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } 549 InstrType visitIntrinsicInst(IntrinsicInst &II) { 550 // These are disabled due to complications in the CodeExtractor when 551 // outlining these instructions. For instance, It is unclear what we 552 // should do when moving only the start or end lifetime instruction into 553 // an outlined function. Also, assume-like intrinsics could be removed 554 // from the region, removing arguments, causing discrepencies in the 555 // number of inputs between different regions. 556 if (II.isAssumeLikeIntrinsic()) 557 return Illegal; 558 return EnableIntrinsics ? Legal : Illegal; 559 } 560 // We only allow call instructions where the function has a name and 561 // is not an indirect call. 562 InstrType visitCallInst(CallInst &CI) { 563 Function *F = CI.getCalledFunction(); 564 bool IsIndirectCall = CI.isIndirectCall(); 565 if (IsIndirectCall && !EnableIndirectCalls) 566 return Illegal; 567 if (!F && !IsIndirectCall) 568 return Illegal; 569 // Functions marked with the swifttailcc and tailcc calling conventions 570 // require special handling when outlining musttail functions. The 571 // calling convention must be passed down to the outlined function as 572 // well. Further, there is special handling for musttail calls as well, 573 // requiring a return call directly after. For now, the outliner does not 574 // support this, so we do not handle matching this case either. 575 if ((CI.getCallingConv() == CallingConv::SwiftTail || 576 CI.getCallingConv() == CallingConv::Tail) && 577 !EnableMustTailCalls) 578 return Illegal; 579 if (CI.isMustTailCall() && !EnableMustTailCalls) 580 return Illegal; 581 return Legal; 582 } 583 // TODO: We do not current handle similarity that changes the control flow. 584 InstrType visitInvokeInst(InvokeInst &II) { return Illegal; } 585 // TODO: We do not current handle similarity that changes the control flow. 586 InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; } 587 // TODO: Handle interblock similarity. 588 InstrType visitTerminator(Instruction &I) { return Illegal; } 589 InstrType visitInstruction(Instruction &I) { return Legal; } 590 591 // The flag variable that lets the classifier know whether we should 592 // allow branches to be checked for similarity. 593 bool EnableBranches = false; 594 595 // The flag variable that lets the classifier know whether we should 596 // allow indirect calls to be considered legal instructions. 597 bool EnableIndirectCalls = false; 598 599 // Flag that lets the classifier know whether we should allow intrinsics to 600 // be checked for similarity. 601 bool EnableIntrinsics = false; 602 603 // Flag that lets the classifier know whether we should allow tail calls to 604 // be checked for similarity. 605 bool EnableMustTailCalls = false; 606 }; 607 608 /// Maps an Instruction to a member of InstrType. 609 InstructionClassification InstClassifier; 610 }; 611 612 /// This is a class that wraps a range of IRInstructionData from one point to 613 /// another in the vector of IRInstructionData, which is a region of the 614 /// program. It is also responsible for defining the structure within this 615 /// region of instructions. 616 /// 617 /// The structure of a region is defined through a value numbering system 618 /// assigned to each unique value in a region at the creation of the 619 /// IRSimilarityCandidate. 620 /// 621 /// For example, for each Instruction we add a mapping for each new 622 /// value seen in that Instruction. 623 /// IR: Mapping Added: 624 /// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2 625 /// %add2 = add i32 %a, %1 %add2 -> 4 626 /// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5 627 /// 628 /// We can compare IRSimilarityCandidates against one another. 629 /// The \ref isSimilar function compares each IRInstructionData against one 630 /// another and if we have the same sequences of IRInstructionData that would 631 /// create the same hash, we have similar IRSimilarityCandidates. 632 /// 633 /// We can also compare the structure of IRSimilarityCandidates. If we can 634 /// create a mapping of registers in the region contained by one 635 /// IRSimilarityCandidate to the region contained by different 636 /// IRSimilarityCandidate, they can be considered structurally similar. 637 /// 638 /// IRSimilarityCandidate1: IRSimilarityCandidate2: 639 /// %add1 = add i32 %a, %b %add1 = add i32 %d, %e 640 /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f 641 /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 642 /// 643 /// Can have the following mapping from candidate to candidate of: 644 /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4 645 /// and can be considered similar. 646 /// 647 /// IRSimilarityCandidate1: IRSimilarityCandidate2: 648 /// %add1 = add i32 %a, %b %add1 = add i32 %d, c4 649 /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f 650 /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 651 /// 652 /// We cannot create the same mapping since the use of c4 is not used in the 653 /// same way as %b or c2. 654 class IRSimilarityCandidate { 655 private: 656 /// The start index of this IRSimilarityCandidate in the instruction list. 657 unsigned StartIdx = 0; 658 659 /// The number of instructions in this IRSimilarityCandidate. 660 unsigned Len = 0; 661 662 /// The first instruction in this IRSimilarityCandidate. 663 IRInstructionData *FirstInst = nullptr; 664 665 /// The last instruction in this IRSimilarityCandidate. 666 IRInstructionData *LastInst = nullptr; 667 668 /// Global Value Numbering structures 669 /// @{ 670 /// Stores the mapping of the value to the number assigned to it in the 671 /// IRSimilarityCandidate. 672 DenseMap<Value *, unsigned> ValueToNumber; 673 /// Stores the mapping of the number to the value assigned this number. 674 DenseMap<unsigned, Value *> NumberToValue; 675 /// Stores the mapping of a value's number to canonical numbering in the 676 /// candidate's respective similarity group. 677 DenseMap<unsigned, unsigned> NumberToCanonNum; 678 /// Stores the mapping of canonical number in the candidate's respective 679 /// similarity group to a value number. 680 DenseMap<unsigned, unsigned> CanonNumToNumber; 681 /// @} 682 683 public: 684 /// \param StartIdx - The starting location of the region. 685 /// \param Len - The length of the region. 686 /// \param FirstInstIt - The starting IRInstructionData of the region. 687 /// \param LastInstIt - The ending IRInstructionData of the region. 688 IRSimilarityCandidate(unsigned StartIdx, unsigned Len, 689 IRInstructionData *FirstInstIt, 690 IRInstructionData *LastInstIt); 691 692 /// \param A - The first IRInstructionCandidate to compare. 693 /// \param B - The second IRInstructionCandidate to compare. 694 /// \returns True when every IRInstructionData in \p A is similar to every 695 /// IRInstructionData in \p B. 696 static bool isSimilar(const IRSimilarityCandidate &A, 697 const IRSimilarityCandidate &B); 698 699 /// \param [in] A - The first IRInstructionCandidate to compare. 700 /// \param [in] B - The second IRInstructionCandidate to compare. 701 /// \returns True when every IRInstructionData in \p A is structurally similar 702 /// to \p B. 703 static bool compareStructure(const IRSimilarityCandidate &A, 704 const IRSimilarityCandidate &B); 705 706 /// \param [in] A - The first IRInstructionCandidate to compare. 707 /// \param [in] B - The second IRInstructionCandidate to compare. 708 /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from 709 /// candidate \p A to candidate \B. 710 /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from 711 /// candidate \p B to candidate \A. 712 /// \returns True when every IRInstructionData in \p A is structurally similar 713 /// to \p B. 714 static bool 715 compareStructure(const IRSimilarityCandidate &A, 716 const IRSimilarityCandidate &B, 717 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA, 718 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB); 719 720 struct OperandMapping { 721 /// The IRSimilarityCandidate that holds the instruction the OperVals were 722 /// pulled from. 723 const IRSimilarityCandidate &IRSC; 724 725 /// The operand values to be analyzed. 726 ArrayRef<Value *> &OperVals; 727 728 /// The current mapping of global value numbers from one IRSimilarityCandidate 729 /// to another IRSimilarityCandidate. 730 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping; 731 }; 732 733 /// A helper struct to hold the candidate, for a branch instruction, the 734 /// relative location of a label, and the label itself. This is mostly to 735 /// group the values together before passing them as a bundle to a function. 736 struct RelativeLocMapping { 737 /// The IRSimilarityCandidate that holds the instruction the relative 738 /// location was pulled from. 739 const IRSimilarityCandidate &IRSC; 740 741 /// The relative location to be analyzed. 742 int RelativeLocation; 743 744 /// The corresponding value. 745 Value *OperVal; 746 }; 747 748 /// Compare the operands in \p A and \p B and check that the current mapping 749 /// of global value numbers from \p A to \p B and \p B to \A is consistent. 750 /// 751 /// \param A - The first IRInstructionCandidate, operand values, and current 752 /// operand mappings to compare. 753 /// \param B - The second IRInstructionCandidate, operand values, and current 754 /// operand mappings to compare. 755 /// \returns true if the IRSimilarityCandidates operands are compatible. 756 static bool compareNonCommutativeOperandMapping(OperandMapping A, 757 OperandMapping B); 758 759 /// Compare the operands in \p A and \p B and check that the current mapping 760 /// of global value numbers from \p A to \p B and \p B to \A is consistent 761 /// given that the operands are commutative. 762 /// 763 /// \param A - The first IRInstructionCandidate, operand values, and current 764 /// operand mappings to compare. 765 /// \param B - The second IRInstructionCandidate, operand values, and current 766 /// operand mappings to compare. 767 /// \returns true if the IRSimilarityCandidates operands are compatible. 768 static bool compareCommutativeOperandMapping(OperandMapping A, 769 OperandMapping B); 770 771 /// Compare the GVN of the assignment value in corresponding instructions in 772 /// IRSimilarityCandidates \p A and \p B and check that there exists a mapping 773 /// between the values and replaces the mapping with a one-to-one value if 774 /// needed. 775 /// 776 /// \param InstValA - The assignment GVN from the first IRSimilarityCandidate. 777 /// \param InstValB - The assignment GVN from the second 778 /// IRSimilarityCandidate. 779 /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from 780 /// candidate \p A to candidate \B. 781 /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from 782 /// candidate \p B to candidate \A. 783 /// \returns true if the IRSimilarityCandidates assignments are compatible. 784 static bool compareAssignmentMapping( 785 const unsigned InstValA, const unsigned &InstValB, 786 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA, 787 DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB); 788 789 /// Compare the relative locations in \p A and \p B and check that the 790 /// distances match if both locations are contained in the region, and that 791 /// the branches both point outside the region if they do not. 792 /// Example Region: 793 /// \code 794 /// entry: 795 /// br i1 %0, label %block_1, label %block_3 796 /// block_0: 797 /// br i1 %0, label %block_1, label %block_2 798 /// block_1: 799 /// br i1 %0, label %block_2, label %block_3 800 /// block_2: 801 /// br i1 %1, label %block_1, label %block_4 802 /// block_3: 803 /// br i1 %2, label %block_2, label %block_5 804 /// \endcode 805 /// If we compare the branches in block_0 and block_1 the relative values are 806 /// 1 and 2 for both, so we consider this a match. 807 /// 808 /// If we compare the branches in entry and block_0 the relative values are 809 /// 2 and 3, and 1 and 2 respectively. Since these are not the same we do not 810 /// consider them a match. 811 /// 812 /// If we compare the branches in block_1 and block_2 the relative values are 813 /// 1 and 2, and -1 and None respectively. As a result we do not consider 814 /// these to be the same 815 /// 816 /// If we compare the branches in block_2 and block_3 the relative values are 817 /// -1 and None for both. We do consider these to be a match. 818 /// 819 /// \param A - The first IRInstructionCandidate, relative location value, 820 /// and incoming block. 821 /// \param B - The second IRInstructionCandidate, relative location value, 822 /// and incoming block. 823 /// \returns true if the relative locations match. 824 static bool checkRelativeLocations(RelativeLocMapping A, 825 RelativeLocMapping B); 826 827 /// Create a mapping from the value numbering to a different separate set of 828 /// numbers. This will serve as a guide for relating one candidate to another. 829 /// The canonical number gives use the ability identify which global value 830 /// number in one candidate relates to the global value number in the other. 831 /// 832 /// \param [in, out] CurrCand - The IRSimilarityCandidate to create a 833 /// canonical numbering for. 834 static void createCanonicalMappingFor(IRSimilarityCandidate &CurrCand); 835 836 /// Create a mapping for the value numbering of the calling 837 /// IRSimilarityCandidate, to a different separate set of numbers, based on 838 /// the canonical ordering in \p SourceCand. These are defined based on the 839 /// found mappings in \p ToSourceMapping and \p FromSourceMapping. Both of 840 /// these relationships should have the same information, just in opposite 841 /// directions. 842 /// 843 /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a 844 /// canonical numbering from. 845 /// \param ToSourceMapping - The mapping of value numbers from this candidate 846 /// to \p SourceCand. 847 /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand 848 /// to this candidate. 849 void createCanonicalRelationFrom( 850 IRSimilarityCandidate &SourceCand, 851 DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping, 852 DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping); 853 854 /// Create a mapping for the value numbering of the calling 855 /// IRSimilarityCandidate, to a different separate set of numbers, based on 856 /// the canonical ordering in \p SourceCand. These are defined based on the 857 /// found mappings in \p ToSourceMapping and \p FromSourceMapping. Both of 858 /// these relationships should have the same information, just in opposite 859 /// directions. Uses the \p OneToOne mapping from target candidate to \p 860 /// SourceCand GVNs to determine the mapping first for values with multiple 861 /// mappings. This mapping is created by the ordering of operands in the 862 /// instruction they are first seen in the candidates. 863 /// 864 /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a 865 /// canonical numbering from. 866 /// \param [in,out] OneToOne - A mapping of value numbers from candidate 867 /// \p A to candidate \B using the structure of the original instructions. 868 /// \param ToSourceMapping - The mapping of value numbers from this candidate 869 /// to \p SourceCand. 870 /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand 871 /// to this candidate. 872 void createCanonicalRelationFrom( 873 IRSimilarityCandidate &SourceCand, 874 DenseMap<unsigned, unsigned> &OneToOne, 875 DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping, 876 DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping); 877 878 /// Create a mapping for the value numbering of the calling 879 /// IRSimilarityCandidate, to a different separate set of numbers, based on 880 /// the canonical ordering in \p SourceCand. These are defined based on the 881 /// canonical mapping defined between \p SoureCandLarge and 882 /// \p TargetCandLarge. These IRSimilarityCandidates are already structurally 883 /// similar, and fully encapsulate the IRSimilarityCandidates in question. 884 /// These are used as a "bridge" from the \p SourceCand to the target. 885 /// 886 /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a 887 /// canonical numbering from. 888 /// \param SoureCandLarge - The IRSimilarityCandidate fully containing 889 /// \p SourceCand. 890 /// \param TargetCandLarge - The IRSimilarityCandidate fully containing 891 /// this Candidate. 892 void createCanonicalRelationFrom( 893 IRSimilarityCandidate &SourceCand, 894 IRSimilarityCandidate &SourceCandLarge, 895 IRSimilarityCandidate &TargetCandLarge); 896 897 /// \param [in,out] BBSet - The set to track the basic blocks. 898 void getBasicBlocks(DenseSet<BasicBlock *> &BBSet) const { 899 for (IRInstructionData &ID : *this) { 900 BasicBlock *BB = ID.Inst->getParent(); 901 BBSet.insert(BB); 902 } 903 } 904 905 /// \param [in,out] BBSet - The set to track the basic blocks. 906 /// \param [in,out] BBList - A list in order of use to track the basic blocks. 907 void getBasicBlocks(DenseSet<BasicBlock *> &BBSet, 908 SmallVector<BasicBlock *> &BBList) const { 909 for (IRInstructionData &ID : *this) { 910 BasicBlock *BB = ID.Inst->getParent(); 911 if (BBSet.insert(BB).second) 912 BBList.push_back(BB); 913 } 914 } 915 916 /// Compare the start and end indices of the two IRSimilarityCandidates for 917 /// whether they overlap. If the start instruction of one 918 /// IRSimilarityCandidate is less than the end instruction of the other, and 919 /// the start instruction of one is greater than the start instruction of the 920 /// other, they overlap. 921 /// 922 /// \returns true if the IRSimilarityCandidates do not have overlapping 923 /// instructions. 924 static bool overlap(const IRSimilarityCandidate &A, 925 const IRSimilarityCandidate &B); 926 927 /// \returns the number of instructions in this Candidate. 928 unsigned getLength() const { return Len; } 929 930 /// \returns the start index of this IRSimilarityCandidate. 931 unsigned getStartIdx() const { return StartIdx; } 932 933 /// \returns the end index of this IRSimilarityCandidate. 934 unsigned getEndIdx() const { return StartIdx + Len - 1; } 935 936 /// \returns The first IRInstructionData. 937 IRInstructionData *front() const { return FirstInst; } 938 /// \returns The last IRInstructionData. 939 IRInstructionData *back() const { return LastInst; } 940 941 /// \returns The first Instruction. 942 Instruction *frontInstruction() { return FirstInst->Inst; } 943 /// \returns The last Instruction 944 Instruction *backInstruction() { return LastInst->Inst; } 945 946 /// \returns The BasicBlock the IRSimilarityCandidate starts in. 947 BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); } 948 /// \returns The BasicBlock the IRSimilarityCandidate ends in. 949 BasicBlock *getEndBB() { return LastInst->Inst->getParent(); } 950 951 /// \returns The Function that the IRSimilarityCandidate is located in. 952 Function *getFunction() { return getStartBB()->getParent(); } 953 954 /// Finds the positive number associated with \p V if it has been mapped. 955 /// \param [in] V - the Value to find. 956 /// \returns The positive number corresponding to the value. 957 /// \returns std::nullopt if not present. 958 std::optional<unsigned> getGVN(Value *V) { 959 assert(V != nullptr && "Value is a nullptr?"); 960 DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V); 961 if (VNIt == ValueToNumber.end()) 962 return std::nullopt; 963 return VNIt->second; 964 } 965 966 /// Finds the Value associate with \p Num if it exists. 967 /// \param [in] Num - the number to find. 968 /// \returns The Value associated with the number. 969 /// \returns std::nullopt if not present. 970 std::optional<Value *> fromGVN(unsigned Num) { 971 DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num); 972 if (VNIt == NumberToValue.end()) 973 return std::nullopt; 974 assert(VNIt->second != nullptr && "Found value is a nullptr!"); 975 return VNIt->second; 976 } 977 978 /// Find the canonical number from the global value number \p N stored in the 979 /// candidate. 980 /// 981 /// \param N - The global value number to find the canonical number for. 982 /// \returns An optional containing the value, and std::nullopt if it could 983 /// not be found. 984 std::optional<unsigned> getCanonicalNum(unsigned N) { 985 DenseMap<unsigned, unsigned>::iterator NCIt = NumberToCanonNum.find(N); 986 if (NCIt == NumberToCanonNum.end()) 987 return std::nullopt; 988 return NCIt->second; 989 } 990 991 /// Find the global value number from the canonical number \p N stored in the 992 /// candidate. 993 /// 994 /// \param N - The canonical number to find the global vlaue number for. 995 /// \returns An optional containing the value, and std::nullopt if it could 996 /// not be found. 997 std::optional<unsigned> fromCanonicalNum(unsigned N) { 998 DenseMap<unsigned, unsigned>::iterator CNIt = CanonNumToNumber.find(N); 999 if (CNIt == CanonNumToNumber.end()) 1000 return std::nullopt; 1001 return CNIt->second; 1002 } 1003 1004 /// \param RHS -The IRSimilarityCandidate to compare against 1005 /// \returns true if the IRSimilarityCandidate is occurs after the 1006 /// IRSimilarityCandidate in the program. 1007 bool operator<(const IRSimilarityCandidate &RHS) const { 1008 return getStartIdx() > RHS.getStartIdx(); 1009 } 1010 1011 using iterator = IRInstructionDataList::iterator; 1012 iterator begin() const { return iterator(front()); } 1013 iterator end() const { return std::next(iterator(back())); } 1014 }; 1015 1016 typedef DenseMap<IRSimilarityCandidate *, 1017 DenseMap<unsigned, DenseSet<unsigned>>> 1018 CandidateGVNMapping; 1019 typedef std::vector<IRSimilarityCandidate> SimilarityGroup; 1020 typedef std::vector<SimilarityGroup> SimilarityGroupList; 1021 1022 /// This class puts all the pieces of the IRInstructionData, 1023 /// IRInstructionMapper, IRSimilarityCandidate together. 1024 /// 1025 /// It first feeds the Module or vector of Modules into the IRInstructionMapper, 1026 /// and puts all the mapped instructions into a single long list of 1027 /// IRInstructionData. 1028 /// 1029 /// The list of unsigned integers is given to the Suffix Tree or similar data 1030 /// structure to find repeated subsequences. We construct an 1031 /// IRSimilarityCandidate for each instance of the subsequence. We compare them 1032 /// against one another since These repeated subsequences can have different 1033 /// structure. For each different kind of structure found, we create a 1034 /// similarity group. 1035 /// 1036 /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are 1037 /// structurally similar to one another, while C is different we would have two 1038 /// SimilarityGroups: 1039 /// 1040 /// SimilarityGroup 1: SimilarityGroup 2 1041 /// A, B, D C 1042 /// 1043 /// A list of the different similarity groups is then returned after 1044 /// analyzing the module. 1045 class IRSimilarityIdentifier { 1046 public: 1047 IRSimilarityIdentifier(bool MatchBranches = true, 1048 bool MatchIndirectCalls = true, 1049 bool MatchCallsWithName = false, 1050 bool MatchIntrinsics = true, 1051 bool MatchMustTailCalls = true) 1052 : Mapper(&InstDataAllocator, &InstDataListAllocator), 1053 EnableBranches(MatchBranches), EnableIndirectCalls(MatchIndirectCalls), 1054 EnableMatchingCallsByName(MatchCallsWithName), 1055 EnableIntrinsics(MatchIntrinsics), 1056 EnableMustTailCalls(MatchMustTailCalls) {} 1057 1058 private: 1059 /// Map the instructions in the module to unsigned integers, using mapping 1060 /// already present in the Mapper if possible. 1061 /// 1062 /// \param [in] M Module - To map to integers. 1063 /// \param [in,out] InstrList - The vector to append IRInstructionData to. 1064 /// \param [in,out] IntegerMapping - The vector to append integers to. 1065 void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList, 1066 std::vector<unsigned> &IntegerMapping); 1067 1068 /// Map the instructions in the modules vector to unsigned integers, using 1069 /// mapping already present in the mapper if possible. 1070 /// 1071 /// \param [in] Modules - The list of modules to use to populate the mapper 1072 /// \param [in,out] InstrList - The vector to append IRInstructionData to. 1073 /// \param [in,out] IntegerMapping - The vector to append integers to. 1074 void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules, 1075 std::vector<IRInstructionData *> &InstrList, 1076 std::vector<unsigned> &IntegerMapping); 1077 1078 /// Find the similarity candidates in \p InstrList and corresponding 1079 /// \p UnsignedVec 1080 /// 1081 /// \param [in,out] InstrList - The vector to append IRInstructionData to. 1082 /// \param [in,out] IntegerMapping - The vector to append integers to. 1083 /// candidates found in the program. 1084 void findCandidates(std::vector<IRInstructionData *> &InstrList, 1085 std::vector<unsigned> &IntegerMapping); 1086 1087 public: 1088 // Find the IRSimilarityCandidates in the \p Modules and group by structural 1089 // similarity in a SimilarityGroup, each group is returned in a 1090 // SimilarityGroupList. 1091 // 1092 // \param [in] Modules - the modules to analyze. 1093 // \returns The groups of similarity ranges found in the modules. 1094 SimilarityGroupList & 1095 findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules); 1096 1097 // Find the IRSimilarityCandidates in the given Module grouped by structural 1098 // similarity in a SimilarityGroup, contained inside a SimilarityGroupList. 1099 // 1100 // \param [in] M - the module to analyze. 1101 // \returns The groups of similarity ranges found in the module. 1102 SimilarityGroupList &findSimilarity(Module &M); 1103 1104 // Clears \ref SimilarityCandidates if it is already filled by a previous run. 1105 void resetSimilarityCandidates() { 1106 // If we've already analyzed a Module or set of Modules, so we must clear 1107 // the SimilarityCandidates to make sure we do not have only old values 1108 // hanging around. 1109 if (SimilarityCandidates) 1110 SimilarityCandidates->clear(); 1111 else 1112 SimilarityCandidates = SimilarityGroupList(); 1113 } 1114 1115 // \returns The groups of similarity ranges found in the most recently passed 1116 // set of modules. 1117 std::optional<SimilarityGroupList> &getSimilarity() { 1118 return SimilarityCandidates; 1119 } 1120 1121 private: 1122 /// The allocator for IRInstructionData. 1123 SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator; 1124 1125 /// The allocator for IRInstructionDataLists. 1126 SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator; 1127 1128 /// Map Instructions to unsigned integers and wraps the Instruction in an 1129 /// instance of IRInstructionData. 1130 IRInstructionMapper Mapper; 1131 1132 /// The flag variable that marks whether we should check branches for 1133 /// similarity, or only look within basic blocks. 1134 bool EnableBranches = true; 1135 1136 /// The flag variable that marks whether we allow indirect calls to be checked 1137 /// for similarity, or exclude them as a legal instruction. 1138 bool EnableIndirectCalls = true; 1139 1140 /// The flag variable that marks whether we allow calls to be marked as 1141 /// similar if they do not have the same name, only the same calling 1142 /// convention, attributes and type signature. 1143 bool EnableMatchingCallsByName = true; 1144 1145 /// The flag variable that marks whether we should check intrinsics for 1146 /// similarity. 1147 bool EnableIntrinsics = true; 1148 1149 // The flag variable that marks whether we should allow tailcalls 1150 // to be checked for similarity. 1151 bool EnableMustTailCalls = false; 1152 1153 /// The SimilarityGroups found with the most recent run of \ref 1154 /// findSimilarity. std::nullopt if there is no recent run. 1155 std::optional<SimilarityGroupList> SimilarityCandidates; 1156 }; 1157 1158 } // end namespace IRSimilarity 1159 1160 /// An analysis pass based on legacy pass manager that runs and returns 1161 /// IRSimilarityIdentifier run on the Module. 1162 class IRSimilarityIdentifierWrapperPass : public ModulePass { 1163 std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI; 1164 1165 public: 1166 static char ID; 1167 IRSimilarityIdentifierWrapperPass(); 1168 1169 IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; } 1170 const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; } 1171 1172 bool doInitialization(Module &M) override; 1173 bool doFinalization(Module &M) override; 1174 bool runOnModule(Module &M) override; 1175 void getAnalysisUsage(AnalysisUsage &AU) const override { 1176 AU.setPreservesAll(); 1177 } 1178 }; 1179 1180 /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the 1181 /// Module. 1182 class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> { 1183 public: 1184 typedef IRSimilarity::IRSimilarityIdentifier Result; 1185 1186 Result run(Module &M, ModuleAnalysisManager &); 1187 1188 private: 1189 friend AnalysisInfoMixin<IRSimilarityAnalysis>; 1190 static AnalysisKey Key; 1191 }; 1192 1193 /// Printer pass that uses \c IRSimilarityAnalysis. 1194 class IRSimilarityAnalysisPrinterPass 1195 : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> { 1196 raw_ostream &OS; 1197 1198 public: 1199 explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} 1200 PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); 1201 static bool isRequired() { return true; } 1202 }; 1203 1204 } // end namespace llvm 1205 1206 #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H 1207