1 //===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// This file implements classes for searching and analyzing source code clones. 10 /// 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Analysis/CloneDetection.h" 14 #include "clang/AST/Attr.h" 15 #include "clang/AST/DataCollection.h" 16 #include "clang/AST/DeclTemplate.h" 17 #include "clang/Basic/SourceManager.h" 18 #include "llvm/Support/MD5.h" 19 #include "llvm/Support/Path.h" 20 21 using namespace clang; 22 23 StmtSequence::StmtSequence(const CompoundStmt *Stmt, const Decl *D, 24 unsigned StartIndex, unsigned EndIndex) 25 : S(Stmt), D(D), StartIndex(StartIndex), EndIndex(EndIndex) { 26 assert(Stmt && "Stmt must not be a nullptr"); 27 assert(StartIndex < EndIndex && "Given array should not be empty"); 28 assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt"); 29 } 30 31 StmtSequence::StmtSequence(const Stmt *Stmt, const Decl *D) 32 : S(Stmt), D(D), StartIndex(0), EndIndex(0) {} 33 34 StmtSequence::StmtSequence() 35 : S(nullptr), D(nullptr), StartIndex(0), EndIndex(0) {} 36 37 bool StmtSequence::contains(const StmtSequence &Other) const { 38 // If both sequences reside in different declarations, they can never contain 39 // each other. 40 if (D != Other.D) 41 return false; 42 43 const SourceManager &SM = getASTContext().getSourceManager(); 44 45 // Otherwise check if the start and end locations of the current sequence 46 // surround the other sequence. 47 bool StartIsInBounds = 48 SM.isBeforeInTranslationUnit(getBeginLoc(), Other.getBeginLoc()) || 49 getBeginLoc() == Other.getBeginLoc(); 50 if (!StartIsInBounds) 51 return false; 52 53 bool EndIsInBounds = 54 SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) || 55 Other.getEndLoc() == getEndLoc(); 56 return EndIsInBounds; 57 } 58 59 StmtSequence::iterator StmtSequence::begin() const { 60 if (!holdsSequence()) { 61 return &S; 62 } 63 auto CS = cast<CompoundStmt>(S); 64 return CS->body_begin() + StartIndex; 65 } 66 67 StmtSequence::iterator StmtSequence::end() const { 68 if (!holdsSequence()) { 69 return reinterpret_cast<StmtSequence::iterator>(&S) + 1; 70 } 71 auto CS = cast<CompoundStmt>(S); 72 return CS->body_begin() + EndIndex; 73 } 74 75 ASTContext &StmtSequence::getASTContext() const { 76 assert(D); 77 return D->getASTContext(); 78 } 79 80 SourceLocation StmtSequence::getBeginLoc() const { 81 return front()->getBeginLoc(); 82 } 83 84 SourceLocation StmtSequence::getEndLoc() const { return back()->getEndLoc(); } 85 86 SourceRange StmtSequence::getSourceRange() const { 87 return SourceRange(getBeginLoc(), getEndLoc()); 88 } 89 90 void CloneDetector::analyzeCodeBody(const Decl *D) { 91 assert(D); 92 assert(D->hasBody()); 93 94 Sequences.push_back(StmtSequence(D->getBody(), D)); 95 } 96 97 /// Returns true if and only if \p Stmt contains at least one other 98 /// sequence in the \p Group. 99 static bool containsAnyInGroup(StmtSequence &Seq, 100 CloneDetector::CloneGroup &Group) { 101 for (StmtSequence &GroupSeq : Group) { 102 if (Seq.contains(GroupSeq)) 103 return true; 104 } 105 return false; 106 } 107 108 /// Returns true if and only if all sequences in \p OtherGroup are 109 /// contained by a sequence in \p Group. 110 static bool containsGroup(CloneDetector::CloneGroup &Group, 111 CloneDetector::CloneGroup &OtherGroup) { 112 // We have less sequences in the current group than we have in the other, 113 // so we will never fulfill the requirement for returning true. This is only 114 // possible because we know that a sequence in Group can contain at most 115 // one sequence in OtherGroup. 116 if (Group.size() < OtherGroup.size()) 117 return false; 118 119 for (StmtSequence &Stmt : Group) { 120 if (!containsAnyInGroup(Stmt, OtherGroup)) 121 return false; 122 } 123 return true; 124 } 125 126 void OnlyLargestCloneConstraint::constrain( 127 std::vector<CloneDetector::CloneGroup> &Result) { 128 std::vector<unsigned> IndexesToRemove; 129 130 // Compare every group in the result with the rest. If one groups contains 131 // another group, we only need to return the bigger group. 132 // Note: This doesn't scale well, so if possible avoid calling any heavy 133 // function from this loop to minimize the performance impact. 134 for (unsigned i = 0; i < Result.size(); ++i) { 135 for (unsigned j = 0; j < Result.size(); ++j) { 136 // Don't compare a group with itself. 137 if (i == j) 138 continue; 139 140 if (containsGroup(Result[j], Result[i])) { 141 IndexesToRemove.push_back(i); 142 break; 143 } 144 } 145 } 146 147 // Erasing a list of indexes from the vector should be done with decreasing 148 // indexes. As IndexesToRemove is constructed with increasing values, we just 149 // reverse iterate over it to get the desired order. 150 for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) { 151 Result.erase(Result.begin() + *I); 152 } 153 } 154 155 bool FilenamePatternConstraint::isAutoGenerated( 156 const CloneDetector::CloneGroup &Group) { 157 if (IgnoredFilesPattern.empty() || Group.empty() || 158 !IgnoredFilesRegex->isValid()) 159 return false; 160 161 for (const StmtSequence &S : Group) { 162 const SourceManager &SM = S.getASTContext().getSourceManager(); 163 StringRef Filename = llvm::sys::path::filename( 164 SM.getFilename(S.getContainingDecl()->getLocation())); 165 if (IgnoredFilesRegex->match(Filename)) 166 return true; 167 } 168 169 return false; 170 } 171 172 /// This class defines what a type II code clone is: If it collects for two 173 /// statements the same data, then those two statements are considered to be 174 /// clones of each other. 175 /// 176 /// All collected data is forwarded to the given data consumer of the type T. 177 /// The data consumer class needs to provide a member method with the signature: 178 /// update(StringRef Str) 179 namespace { 180 template <class T> 181 class CloneTypeIIStmtDataCollector 182 : public ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>> { 183 ASTContext &Context; 184 /// The data sink to which all data is forwarded. 185 T &DataConsumer; 186 187 template <class Ty> void addData(const Ty &Data) { 188 data_collection::addDataToConsumer(DataConsumer, Data); 189 } 190 191 public: 192 CloneTypeIIStmtDataCollector(const Stmt *S, ASTContext &Context, 193 T &DataConsumer) 194 : Context(Context), DataConsumer(DataConsumer) { 195 this->Visit(S); 196 } 197 198 // Define a visit method for each class to collect data and subsequently visit 199 // all parent classes. This uses a template so that custom visit methods by us 200 // take precedence. 201 #define DEF_ADD_DATA(CLASS, CODE) \ 202 template <class = void> void Visit##CLASS(const CLASS *S) { \ 203 CODE; \ 204 ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \ 205 } 206 207 #include "clang/AST/StmtDataCollectors.inc" 208 209 // Type II clones ignore variable names and literals, so let's skip them. 210 #define SKIP(CLASS) \ 211 void Visit##CLASS(const CLASS *S) { \ 212 ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \ 213 } 214 SKIP(DeclRefExpr) 215 SKIP(MemberExpr) 216 SKIP(IntegerLiteral) 217 SKIP(FloatingLiteral) 218 SKIP(StringLiteral) 219 SKIP(CXXBoolLiteralExpr) 220 SKIP(CharacterLiteral) 221 #undef SKIP 222 }; 223 } // end anonymous namespace 224 225 static size_t createHash(llvm::MD5 &Hash) { 226 size_t HashCode; 227 228 // Create the final hash code for the current Stmt. 229 llvm::MD5::MD5Result HashResult; 230 Hash.final(HashResult); 231 232 // Copy as much as possible of the generated hash code to the Stmt's hash 233 // code. 234 std::memcpy(&HashCode, &HashResult, 235 std::min(sizeof(HashCode), sizeof(HashResult))); 236 237 return HashCode; 238 } 239 240 /// Generates and saves a hash code for the given Stmt. 241 /// \param S The given Stmt. 242 /// \param D The Decl containing S. 243 /// \param StmtsByHash Output parameter that will contain the hash codes for 244 /// each StmtSequence in the given Stmt. 245 /// \return The hash code of the given Stmt. 246 /// 247 /// If the given Stmt is a CompoundStmt, this method will also generate 248 /// hashes for all possible StmtSequences in the children of this Stmt. 249 static size_t 250 saveHash(const Stmt *S, const Decl *D, 251 std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash) { 252 llvm::MD5 Hash; 253 ASTContext &Context = D->getASTContext(); 254 255 CloneTypeIIStmtDataCollector<llvm::MD5>(S, Context, Hash); 256 257 auto CS = dyn_cast<CompoundStmt>(S); 258 SmallVector<size_t, 8> ChildHashes; 259 260 for (const Stmt *Child : S->children()) { 261 if (Child == nullptr) { 262 ChildHashes.push_back(0); 263 continue; 264 } 265 size_t ChildHash = saveHash(Child, D, StmtsByHash); 266 Hash.update( 267 StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash))); 268 ChildHashes.push_back(ChildHash); 269 } 270 271 if (CS) { 272 // If we're in a CompoundStmt, we hash all possible combinations of child 273 // statements to find clones in those subsequences. 274 // We first go through every possible starting position of a subsequence. 275 for (unsigned Pos = 0; Pos < CS->size(); ++Pos) { 276 // Then we try all possible lengths this subsequence could have and 277 // reuse the same hash object to make sure we only hash every child 278 // hash exactly once. 279 llvm::MD5 Hash; 280 for (unsigned Length = 1; Length <= CS->size() - Pos; ++Length) { 281 // Grab the current child hash and put it into our hash. We do 282 // -1 on the index because we start counting the length at 1. 283 size_t ChildHash = ChildHashes[Pos + Length - 1]; 284 Hash.update( 285 StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash))); 286 // If we have at least two elements in our subsequence, we can start 287 // saving it. 288 if (Length > 1) { 289 llvm::MD5 SubHash = Hash; 290 StmtsByHash.push_back(std::make_pair( 291 createHash(SubHash), StmtSequence(CS, D, Pos, Pos + Length))); 292 } 293 } 294 } 295 } 296 297 size_t HashCode = createHash(Hash); 298 StmtsByHash.push_back(std::make_pair(HashCode, StmtSequence(S, D))); 299 return HashCode; 300 } 301 302 namespace { 303 /// Wrapper around FoldingSetNodeID that it can be used as the template 304 /// argument of the StmtDataCollector. 305 class FoldingSetNodeIDWrapper { 306 307 llvm::FoldingSetNodeID &FS; 308 309 public: 310 FoldingSetNodeIDWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {} 311 312 void update(StringRef Str) { FS.AddString(Str); } 313 }; 314 } // end anonymous namespace 315 316 /// Writes the relevant data from all statements and child statements 317 /// in the given StmtSequence into the given FoldingSetNodeID. 318 static void CollectStmtSequenceData(const StmtSequence &Sequence, 319 FoldingSetNodeIDWrapper &OutputData) { 320 for (const Stmt *S : Sequence) { 321 CloneTypeIIStmtDataCollector<FoldingSetNodeIDWrapper>( 322 S, Sequence.getASTContext(), OutputData); 323 324 for (const Stmt *Child : S->children()) { 325 if (!Child) 326 continue; 327 328 CollectStmtSequenceData(StmtSequence(Child, Sequence.getContainingDecl()), 329 OutputData); 330 } 331 } 332 } 333 334 /// Returns true if both sequences are clones of each other. 335 static bool areSequencesClones(const StmtSequence &LHS, 336 const StmtSequence &RHS) { 337 // We collect the data from all statements in the sequence as we did before 338 // when generating a hash value for each sequence. But this time we don't 339 // hash the collected data and compare the whole data set instead. This 340 // prevents any false-positives due to hash code collisions. 341 llvm::FoldingSetNodeID DataLHS, DataRHS; 342 FoldingSetNodeIDWrapper LHSWrapper(DataLHS); 343 FoldingSetNodeIDWrapper RHSWrapper(DataRHS); 344 345 CollectStmtSequenceData(LHS, LHSWrapper); 346 CollectStmtSequenceData(RHS, RHSWrapper); 347 348 return DataLHS == DataRHS; 349 } 350 351 void RecursiveCloneTypeIIHashConstraint::constrain( 352 std::vector<CloneDetector::CloneGroup> &Sequences) { 353 // FIXME: Maybe we can do this in-place and don't need this additional vector. 354 std::vector<CloneDetector::CloneGroup> Result; 355 356 for (CloneDetector::CloneGroup &Group : Sequences) { 357 // We assume in the following code that the Group is non-empty, so we 358 // skip all empty groups. 359 if (Group.empty()) 360 continue; 361 362 std::vector<std::pair<size_t, StmtSequence>> StmtsByHash; 363 364 // Generate hash codes for all children of S and save them in StmtsByHash. 365 for (const StmtSequence &S : Group) { 366 saveHash(S.front(), S.getContainingDecl(), StmtsByHash); 367 } 368 369 // Sort hash_codes in StmtsByHash. 370 llvm::stable_sort(StmtsByHash, llvm::less_first()); 371 372 // Check for each StmtSequence if its successor has the same hash value. 373 // We don't check the last StmtSequence as it has no successor. 374 // Note: The 'size - 1 ' in the condition is safe because we check for an 375 // empty Group vector at the beginning of this function. 376 for (unsigned i = 0; i < StmtsByHash.size() - 1; ++i) { 377 const auto Current = StmtsByHash[i]; 378 379 // It's likely that we just found a sequence of StmtSequences that 380 // represent a CloneGroup, so we create a new group and start checking and 381 // adding the StmtSequences in this sequence. 382 CloneDetector::CloneGroup NewGroup; 383 384 size_t PrototypeHash = Current.first; 385 386 for (; i < StmtsByHash.size(); ++i) { 387 // A different hash value means we have reached the end of the sequence. 388 if (PrototypeHash != StmtsByHash[i].first) { 389 // The current sequence could be the start of a new CloneGroup. So we 390 // decrement i so that we visit it again in the outer loop. 391 // Note: i can never be 0 at this point because we are just comparing 392 // the hash of the Current StmtSequence with itself in the 'if' above. 393 assert(i != 0); 394 --i; 395 break; 396 } 397 // Same hash value means we should add the StmtSequence to the current 398 // group. 399 NewGroup.push_back(StmtsByHash[i].second); 400 } 401 402 // We created a new clone group with matching hash codes and move it to 403 // the result vector. 404 Result.push_back(NewGroup); 405 } 406 } 407 // Sequences is the output parameter, so we copy our result into it. 408 Sequences = Result; 409 } 410 411 void RecursiveCloneTypeIIVerifyConstraint::constrain( 412 std::vector<CloneDetector::CloneGroup> &Sequences) { 413 CloneConstraint::splitCloneGroups( 414 Sequences, [](const StmtSequence &A, const StmtSequence &B) { 415 return areSequencesClones(A, B); 416 }); 417 } 418 419 size_t MinComplexityConstraint::calculateStmtComplexity( 420 const StmtSequence &Seq, std::size_t Limit, 421 const std::string &ParentMacroStack) { 422 if (Seq.empty()) 423 return 0; 424 425 size_t Complexity = 1; 426 427 ASTContext &Context = Seq.getASTContext(); 428 429 // Look up what macros expanded into the current statement. 430 std::string MacroStack = 431 data_collection::getMacroStack(Seq.getBeginLoc(), Context); 432 433 // First, check if ParentMacroStack is not empty which means we are currently 434 // dealing with a parent statement which was expanded from a macro. 435 // If this parent statement was expanded from the same macros as this 436 // statement, we reduce the initial complexity of this statement to zero. 437 // This causes that a group of statements that were generated by a single 438 // macro expansion will only increase the total complexity by one. 439 // Note: This is not the final complexity of this statement as we still 440 // add the complexity of the child statements to the complexity value. 441 if (!ParentMacroStack.empty() && MacroStack == ParentMacroStack) { 442 Complexity = 0; 443 } 444 445 // Iterate over the Stmts in the StmtSequence and add their complexity values 446 // to the current complexity value. 447 if (Seq.holdsSequence()) { 448 for (const Stmt *S : Seq) { 449 Complexity += calculateStmtComplexity( 450 StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack); 451 if (Complexity >= Limit) 452 return Limit; 453 } 454 } else { 455 for (const Stmt *S : Seq.front()->children()) { 456 Complexity += calculateStmtComplexity( 457 StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack); 458 if (Complexity >= Limit) 459 return Limit; 460 } 461 } 462 return Complexity; 463 } 464 465 void MatchingVariablePatternConstraint::constrain( 466 std::vector<CloneDetector::CloneGroup> &CloneGroups) { 467 CloneConstraint::splitCloneGroups( 468 CloneGroups, [](const StmtSequence &A, const StmtSequence &B) { 469 VariablePattern PatternA(A); 470 VariablePattern PatternB(B); 471 return PatternA.countPatternDifferences(PatternB) == 0; 472 }); 473 } 474 475 void CloneConstraint::splitCloneGroups( 476 std::vector<CloneDetector::CloneGroup> &CloneGroups, 477 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> 478 Compare) { 479 std::vector<CloneDetector::CloneGroup> Result; 480 for (auto &HashGroup : CloneGroups) { 481 // Contains all indexes in HashGroup that were already added to a 482 // CloneGroup. 483 std::vector<char> Indexes; 484 Indexes.resize(HashGroup.size()); 485 486 for (unsigned i = 0; i < HashGroup.size(); ++i) { 487 // Skip indexes that are already part of a CloneGroup. 488 if (Indexes[i]) 489 continue; 490 491 // Pick the first unhandled StmtSequence and consider it as the 492 // beginning 493 // of a new CloneGroup for now. 494 // We don't add i to Indexes because we never iterate back. 495 StmtSequence Prototype = HashGroup[i]; 496 CloneDetector::CloneGroup PotentialGroup = {Prototype}; 497 ++Indexes[i]; 498 499 // Check all following StmtSequences for clones. 500 for (unsigned j = i + 1; j < HashGroup.size(); ++j) { 501 // Skip indexes that are already part of a CloneGroup. 502 if (Indexes[j]) 503 continue; 504 505 // If a following StmtSequence belongs to our CloneGroup, we add it. 506 const StmtSequence &Candidate = HashGroup[j]; 507 508 if (!Compare(Prototype, Candidate)) 509 continue; 510 511 PotentialGroup.push_back(Candidate); 512 // Make sure we never visit this StmtSequence again. 513 ++Indexes[j]; 514 } 515 516 // Otherwise, add it to the result and continue searching for more 517 // groups. 518 Result.push_back(PotentialGroup); 519 } 520 521 assert(llvm::all_of(Indexes, [](char c) { return c == 1; })); 522 } 523 CloneGroups = Result; 524 } 525 526 void VariablePattern::addVariableOccurence(const VarDecl *VarDecl, 527 const Stmt *Mention) { 528 // First check if we already reference this variable 529 for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) { 530 if (Variables[KindIndex] == VarDecl) { 531 // If yes, add a new occurrence that points to the existing entry in 532 // the Variables vector. 533 Occurences.emplace_back(KindIndex, Mention); 534 return; 535 } 536 } 537 // If this variable wasn't already referenced, add it to the list of 538 // referenced variables and add a occurrence that points to this new entry. 539 Occurences.emplace_back(Variables.size(), Mention); 540 Variables.push_back(VarDecl); 541 } 542 543 void VariablePattern::addVariables(const Stmt *S) { 544 // Sometimes we get a nullptr (such as from IfStmts which often have nullptr 545 // children). We skip such statements as they don't reference any 546 // variables. 547 if (!S) 548 return; 549 550 // Check if S is a reference to a variable. If yes, add it to the pattern. 551 if (auto D = dyn_cast<DeclRefExpr>(S)) { 552 if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl())) 553 addVariableOccurence(VD, D); 554 } 555 556 // Recursively check all children of the given statement. 557 for (const Stmt *Child : S->children()) { 558 addVariables(Child); 559 } 560 } 561 562 unsigned VariablePattern::countPatternDifferences( 563 const VariablePattern &Other, 564 VariablePattern::SuspiciousClonePair *FirstMismatch) { 565 unsigned NumberOfDifferences = 0; 566 567 assert(Other.Occurences.size() == Occurences.size()); 568 for (unsigned i = 0; i < Occurences.size(); ++i) { 569 auto ThisOccurence = Occurences[i]; 570 auto OtherOccurence = Other.Occurences[i]; 571 if (ThisOccurence.KindID == OtherOccurence.KindID) 572 continue; 573 574 ++NumberOfDifferences; 575 576 // If FirstMismatch is not a nullptr, we need to store information about 577 // the first difference between the two patterns. 578 if (FirstMismatch == nullptr) 579 continue; 580 581 // Only proceed if we just found the first difference as we only store 582 // information about the first difference. 583 if (NumberOfDifferences != 1) 584 continue; 585 586 const VarDecl *FirstSuggestion = nullptr; 587 // If there is a variable available in the list of referenced variables 588 // which wouldn't break the pattern if it is used in place of the 589 // current variable, we provide this variable as the suggested fix. 590 if (OtherOccurence.KindID < Variables.size()) 591 FirstSuggestion = Variables[OtherOccurence.KindID]; 592 593 // Store information about the first clone. 594 FirstMismatch->FirstCloneInfo = 595 VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo( 596 Variables[ThisOccurence.KindID], ThisOccurence.Mention, 597 FirstSuggestion); 598 599 // Same as above but with the other clone. We do this for both clones as 600 // we don't know which clone is the one containing the unintended 601 // pattern error. 602 const VarDecl *SecondSuggestion = nullptr; 603 if (ThisOccurence.KindID < Other.Variables.size()) 604 SecondSuggestion = Other.Variables[ThisOccurence.KindID]; 605 606 // Store information about the second clone. 607 FirstMismatch->SecondCloneInfo = 608 VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo( 609 Other.Variables[OtherOccurence.KindID], OtherOccurence.Mention, 610 SecondSuggestion); 611 612 // SuspiciousClonePair guarantees that the first clone always has a 613 // suggested variable associated with it. As we know that one of the two 614 // clones in the pair always has suggestion, we swap the two clones 615 // in case the first clone has no suggested variable which means that 616 // the second clone has a suggested variable and should be first. 617 if (!FirstMismatch->FirstCloneInfo.Suggestion) 618 std::swap(FirstMismatch->FirstCloneInfo, FirstMismatch->SecondCloneInfo); 619 620 // This ensures that we always have at least one suggestion in a pair. 621 assert(FirstMismatch->FirstCloneInfo.Suggestion); 622 } 623 624 return NumberOfDifferences; 625 } 626