1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file defines classes for searching and analyzing source code clones. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H 15 #define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H 16 17 #include "clang/AST/StmtVisitor.h" 18 #include "llvm/Support/Regex.h" 19 #include <vector> 20 21 namespace clang { 22 23 class Stmt; 24 class Decl; 25 class VarDecl; 26 class ASTContext; 27 class CompoundStmt; 28 29 /// Identifies a list of statements. 30 /// 31 /// Can either identify a single arbitrary Stmt object, a continuous sequence of 32 /// child statements inside a CompoundStmt or no statements at all. 33 class StmtSequence { 34 /// If this object identifies a sequence of statements inside a CompoundStmt, 35 /// S points to this CompoundStmt. If this object only identifies a single 36 /// Stmt, then S is a pointer to this Stmt. 37 const Stmt *S; 38 39 /// The declaration that contains the statements. 40 const Decl *D; 41 42 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence 43 /// instance is representing the CompoundStmt children inside the array 44 /// [StartIndex, EndIndex). 45 unsigned StartIndex; 46 unsigned EndIndex; 47 48 public: 49 /// Constructs a StmtSequence holding multiple statements. 50 /// 51 /// The resulting StmtSequence identifies a continuous sequence of statements 52 /// in the body of the given CompoundStmt. Which statements of the body should 53 /// be identified needs to be specified by providing a start and end index 54 /// that describe a non-empty sub-array in the body of the given CompoundStmt. 55 /// 56 /// \param Stmt A CompoundStmt that contains all statements in its body. 57 /// \param D The Decl containing this Stmt. 58 /// \param StartIndex The inclusive start index in the children array of 59 /// \p Stmt 60 /// \param EndIndex The exclusive end index in the children array of \p Stmt. 61 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex, 62 unsigned EndIndex); 63 64 /// Constructs a StmtSequence holding a single statement. 65 /// 66 /// \param Stmt An arbitrary Stmt. 67 /// \param D The Decl containing this Stmt. 68 StmtSequence(const Stmt *Stmt, const Decl *D); 69 70 /// Constructs an empty StmtSequence. 71 StmtSequence(); 72 73 typedef const Stmt *const *iterator; 74 75 /// Returns an iterator pointing to the first statement in this sequence. 76 iterator begin() const; 77 78 /// Returns an iterator pointing behind the last statement in this sequence. 79 iterator end() const; 80 81 /// Returns the first statement in this sequence. 82 /// 83 /// This method should only be called on a non-empty StmtSequence object. front()84 const Stmt *front() const { 85 assert(!empty()); 86 return begin()[0]; 87 } 88 89 /// Returns the last statement in this sequence. 90 /// 91 /// This method should only be called on a non-empty StmtSequence object. back()92 const Stmt *back() const { 93 assert(!empty()); 94 return begin()[size() - 1]; 95 } 96 97 /// Returns the number of statements this object holds. size()98 unsigned size() const { 99 if (holdsSequence()) 100 return EndIndex - StartIndex; 101 if (S == nullptr) 102 return 0; 103 return 1; 104 } 105 106 /// Returns true if and only if this StmtSequence contains no statements. empty()107 bool empty() const { return size() == 0; } 108 109 /// Returns the related ASTContext for the stored Stmts. 110 ASTContext &getASTContext() const; 111 112 /// Returns the declaration that contains the stored Stmts. getContainingDecl()113 const Decl *getContainingDecl() const { 114 assert(D); 115 return D; 116 } 117 118 /// Returns true if this objects holds a list of statements. holdsSequence()119 bool holdsSequence() const { return EndIndex != 0; } 120 121 /// Returns the start sourcelocation of the first statement in this sequence. 122 /// 123 /// This method should only be called on a non-empty StmtSequence object. 124 SourceLocation getBeginLoc() const; 125 126 /// Returns the end sourcelocation of the last statement in this sequence. 127 /// 128 /// This method should only be called on a non-empty StmtSequence object. 129 SourceLocation getEndLoc() const; 130 131 /// Returns the source range of the whole sequence - from the beginning 132 /// of the first statement to the end of the last statement. 133 SourceRange getSourceRange() const; 134 135 bool operator==(const StmtSequence &Other) const { 136 return std::tie(S, StartIndex, EndIndex) == 137 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 138 } 139 140 bool operator!=(const StmtSequence &Other) const { 141 return std::tie(S, StartIndex, EndIndex) != 142 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 143 } 144 145 /// Returns true if and only if this sequence covers a source range that 146 /// contains the source range of the given sequence \p Other. 147 /// 148 /// This method should only be called on a non-empty StmtSequence object 149 /// and passed a non-empty StmtSequence object. 150 bool contains(const StmtSequence &Other) const; 151 }; 152 153 /// Searches for similar subtrees in the AST. 154 /// 155 /// First, this class needs several declarations with statement bodies which 156 /// can be passed via analyzeCodeBody. Afterwards all statements can be 157 /// searched for clones by calling findClones with a given list of constraints 158 /// that should specify the wanted properties of the clones. 159 /// 160 /// The result of findClones can be further constrained with the constrainClones 161 /// method. 162 /// 163 /// This class only searches for clones in executable source code 164 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) 165 /// are not supported. 166 class CloneDetector { 167 168 public: 169 /// A collection of StmtSequences that share an arbitrary property. 170 typedef llvm::SmallVector<StmtSequence, 8> CloneGroup; 171 172 /// Generates and stores search data for all statements in the body of 173 /// the given Decl. 174 void analyzeCodeBody(const Decl *D); 175 176 /// Constrains the given list of clone groups with the given constraint. 177 /// 178 /// The constraint is expected to have a method with the signature 179 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)` 180 /// as this is the interface that the CloneDetector uses for applying the 181 /// constraint. The constraint is supposed to directly modify the passed list 182 /// so that all clones in the list fulfill the specific property this 183 /// constraint ensures. 184 template <typename T> constrainClones(std::vector<CloneGroup> & CloneGroups,T C)185 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) { 186 C.constrain(CloneGroups); 187 } 188 189 /// Constrains the given list of clone groups with the given list of 190 /// constraints. 191 /// 192 /// The constraints are applied in sequence in the order in which they are 193 /// passed to this function. 194 template <typename T1, typename... Ts> constrainClones(std::vector<CloneGroup> & CloneGroups,T1 C,Ts...ConstraintList)195 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C, 196 Ts... ConstraintList) { 197 constrainClones(CloneGroups, C); 198 constrainClones(CloneGroups, ConstraintList...); 199 } 200 201 /// Searches for clones in all previously passed statements. 202 /// \param Result Output parameter to which all created clone groups are 203 /// added. 204 /// \param ConstraintList The constraints that should be applied to the 205 // result. 206 template <typename... Ts> findClones(std::vector<CloneGroup> & Result,Ts...ConstraintList)207 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) { 208 // The initial assumption is that there is only one clone group and every 209 // statement is a clone of the others. This clone group will then be 210 // split up with the help of the constraints. 211 Result.push_back(Sequences); 212 213 constrainClones(Result, ConstraintList...); 214 } 215 216 private: 217 CloneGroup Sequences; 218 }; 219 220 /// This class is a utility class that contains utility functions for building 221 /// custom constraints. 222 class CloneConstraint { 223 public: 224 /// Removes all groups by using a filter function. 225 /// \param CloneGroups The list of CloneGroups that is supposed to be 226 /// filtered. 227 /// \param Filter The filter function that should return true for all groups 228 /// that should be removed from the list. filterGroups(std::vector<CloneDetector::CloneGroup> & CloneGroups,llvm::function_ref<bool (const CloneDetector::CloneGroup &)> Filter)229 static void filterGroups( 230 std::vector<CloneDetector::CloneGroup> &CloneGroups, 231 llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) { 232 llvm::erase_if(CloneGroups, Filter); 233 } 234 235 /// Splits the given CloneGroups until the given Compare function returns true 236 /// for all clones in a single group. 237 /// \param CloneGroups A list of CloneGroups that should be modified. 238 /// \param Compare The comparison function that all clones are supposed to 239 /// pass. Should return true if and only if two clones belong 240 /// to the same CloneGroup. 241 static void splitCloneGroups( 242 std::vector<CloneDetector::CloneGroup> &CloneGroups, 243 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> 244 Compare); 245 }; 246 247 /// This constraint moves clones into clone groups of type II via hashing. 248 /// 249 /// Clones with different hash values are moved into separate clone groups. 250 /// Collisions are possible, and this constraint does nothing to address this 251 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the 252 /// constraint chain, not necessarily immediately, to eliminate hash collisions 253 /// through a more detailed analysis. 254 class RecursiveCloneTypeIIHashConstraint { 255 public: 256 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 257 }; 258 259 /// This constraint moves clones into clone groups of type II by comparing them. 260 /// 261 /// Clones that aren't type II clones are moved into separate clone groups. 262 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone 263 /// group are guaranteed to be type II clones of each other, but it is too 264 /// slow to efficiently handle large amounts of clones. 265 class RecursiveCloneTypeIIVerifyConstraint { 266 public: 267 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 268 }; 269 270 /// Ensures that every clone has at least the given complexity. 271 /// 272 /// Complexity is here defined as the total amount of children of a statement. 273 /// This constraint assumes the first statement in the group is representative 274 /// for all other statements in the group in terms of complexity. 275 class MinComplexityConstraint { 276 unsigned MinComplexity; 277 278 public: MinComplexityConstraint(unsigned MinComplexity)279 MinComplexityConstraint(unsigned MinComplexity) 280 : MinComplexity(MinComplexity) {} 281 282 /// Calculates the complexity of the given StmtSequence. 283 /// \param Limit The limit of complexity we probe for. After reaching 284 /// this limit during calculation, this method is exiting 285 /// early to improve performance and returns this limit. 286 size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit, 287 const std::string &ParentMacroStack = ""); 288 constrain(std::vector<CloneDetector::CloneGroup> & CloneGroups)289 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 290 CloneConstraint::filterGroups( 291 CloneGroups, [this](const CloneDetector::CloneGroup &A) { 292 if (!A.empty()) 293 return calculateStmtComplexity(A.front(), MinComplexity) < 294 MinComplexity; 295 else 296 return false; 297 }); 298 } 299 }; 300 301 /// Ensures that all clone groups contain at least the given amount of clones. 302 class MinGroupSizeConstraint { 303 unsigned MinGroupSize; 304 305 public: 306 MinGroupSizeConstraint(unsigned MinGroupSize = 2) MinGroupSize(MinGroupSize)307 : MinGroupSize(MinGroupSize) {} 308 constrain(std::vector<CloneDetector::CloneGroup> & CloneGroups)309 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 310 CloneConstraint::filterGroups(CloneGroups, 311 [this](const CloneDetector::CloneGroup &A) { 312 return A.size() < MinGroupSize; 313 }); 314 } 315 }; 316 317 /// Ensures that no clone group fully contains another clone group. 318 struct OnlyLargestCloneConstraint { 319 void constrain(std::vector<CloneDetector::CloneGroup> &Result); 320 }; 321 322 struct FilenamePatternConstraint { 323 StringRef IgnoredFilesPattern; 324 std::shared_ptr<llvm::Regex> IgnoredFilesRegex; 325 FilenamePatternConstraintFilenamePatternConstraint326 FilenamePatternConstraint(StringRef IgnoredFilesPattern) 327 : IgnoredFilesPattern(IgnoredFilesPattern) { 328 IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" + 329 IgnoredFilesPattern.str() + "$)"); 330 } 331 332 bool isAutoGenerated(const CloneDetector::CloneGroup &Group); 333 constrainFilenamePatternConstraint334 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 335 CloneConstraint::filterGroups( 336 CloneGroups, [this](const CloneDetector::CloneGroup &Group) { 337 return isAutoGenerated(Group); 338 }); 339 } 340 }; 341 342 /// Analyzes the pattern of the referenced variables in a statement. 343 class VariablePattern { 344 345 /// Describes an occurrence of a variable reference in a statement. 346 struct VariableOccurence { 347 /// The index of the associated VarDecl in the Variables vector. 348 size_t KindID; 349 /// The statement in the code where the variable was referenced. 350 const Stmt *Mention; 351 VariableOccurenceVariableOccurence352 VariableOccurence(size_t KindID, const Stmt *Mention) 353 : KindID(KindID), Mention(Mention) {} 354 }; 355 356 /// All occurrences of referenced variables in the order of appearance. 357 std::vector<VariableOccurence> Occurences; 358 /// List of referenced variables in the order of appearance. 359 /// Every item in this list is unique. 360 std::vector<const VarDecl *> Variables; 361 362 /// Adds a new variable referenced to this pattern. 363 /// \param VarDecl The declaration of the variable that is referenced. 364 /// \param Mention The SourceRange where this variable is referenced. 365 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention); 366 367 /// Adds each referenced variable from the given statement. 368 void addVariables(const Stmt *S); 369 370 public: 371 /// Creates an VariablePattern object with information about the given 372 /// StmtSequence. VariablePattern(const StmtSequence & Sequence)373 VariablePattern(const StmtSequence &Sequence) { 374 for (const Stmt *S : Sequence) 375 addVariables(S); 376 } 377 378 /// Describes two clones that reference their variables in a different pattern 379 /// which could indicate a programming error. 380 struct SuspiciousClonePair { 381 /// Utility class holding the relevant information about a single 382 /// clone in this pair. 383 struct SuspiciousCloneInfo { 384 /// The variable which referencing in this clone was against the pattern. 385 const VarDecl *Variable; 386 /// Where the variable was referenced. 387 const Stmt *Mention; 388 /// The variable that should have been referenced to follow the pattern. 389 /// If Suggestion is a nullptr then it's not possible to fix the pattern 390 /// by referencing a different variable in this clone. 391 const VarDecl *Suggestion; SuspiciousCloneInfoSuspiciousClonePair::SuspiciousCloneInfo392 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, 393 const VarDecl *Suggestion) 394 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {} SuspiciousCloneInfoSuspiciousClonePair::SuspiciousCloneInfo395 SuspiciousCloneInfo() {} 396 }; 397 /// The first clone in the pair which always has a suggested variable. 398 SuspiciousCloneInfo FirstCloneInfo; 399 /// This other clone in the pair which can have a suggested variable. 400 SuspiciousCloneInfo SecondCloneInfo; 401 }; 402 403 /// Counts the differences between this pattern and the given one. 404 /// \param Other The given VariablePattern to compare with. 405 /// \param FirstMismatch Output parameter that will be filled with information 406 /// about the first difference between the two patterns. This parameter 407 /// can be a nullptr, in which case it will be ignored. 408 /// \return Returns the number of differences between the pattern this object 409 /// is following and the given VariablePattern. 410 /// 411 /// For example, the following statements all have the same pattern and this 412 /// function would return zero: 413 /// 414 /// if (a < b) return a; return b; 415 /// if (x < y) return x; return y; 416 /// if (u2 < u1) return u2; return u1; 417 /// 418 /// But the following statement has a different pattern (note the changed 419 /// variables in the return statements) and would have two differences when 420 /// compared with one of the statements above. 421 /// 422 /// if (a < b) return b; return a; 423 /// 424 /// This function should only be called if the related statements of the given 425 /// pattern and the statements of this objects are clones of each other. 426 unsigned countPatternDifferences( 427 const VariablePattern &Other, 428 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr); 429 }; 430 431 /// Ensures that all clones reference variables in the same pattern. 432 struct MatchingVariablePatternConstraint { 433 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups); 434 }; 435 436 } // end namespace clang 437 438 #endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H 439