//===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file defines a Levenshtein distance function that works for any two
/// sequences, with each element of each sequence being analogous to a character
/// in a string.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_ADT_EDIT_DISTANCE_H
#define LLVM_ADT_EDIT_DISTANCE_H

#include "llvm/ADT/ArrayRef.h"
#include <algorithm>

namespace llvm {

/// Determine the edit distance between two sequences.
///
/// \param FromArray the first sequence to compare.
///
/// \param ToArray the second sequence to compare.
///
/// \param Map A Functor to apply to each item of the sequences before
/// comparison.
///
/// \param AllowReplacements whether to allow element replacements (change one
/// element into another) as a single operation, rather than as two operations
/// (an insertion and a removal).
///
/// \param MaxEditDistance If non-zero, the maximum edit distance that this
/// routine is allowed to compute. If the edit distance will exceed that
/// maximum, returns \c MaxEditDistance+1.
///
/// \returns the minimum number of element insertions, removals, or (if
/// \p AllowReplacements is \c true) replacements needed to transform one of
/// the given sequences into the other. If zero, the sequences are identical.
44 template <typename T, typename Functor> 45 unsigned ComputeMappedEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, 46 Functor Map, bool AllowReplacements = true, 47 unsigned MaxEditDistance = 0) { 48 // The algorithm implemented below is the "classic" 49 // dynamic-programming algorithm for computing the Levenshtein 50 // distance, which is described here: 51 // 52 // http://en.wikipedia.org/wiki/Levenshtein_distance 53 // 54 // Although the algorithm is typically described using an m x n 55 // array, only one row plus one element are used at a time, so this 56 // implementation just keeps one vector for the row. To update one entry, 57 // only the entries to the left, top, and top-left are needed. The left 58 // entry is in Row[x-1], the top entry is what's in Row[x] from the last 59 // iteration, and the top-left entry is stored in Previous. 60 typename ArrayRef<T>::size_type m = FromArray.size(); 61 typename ArrayRef<T>::size_type n = ToArray.size(); 62 63 if (MaxEditDistance) { 64 // If the difference in size between the 2 arrays is larger than the max 65 // distance allowed, we can bail out as we will always need at least 66 // MaxEditDistance insertions or removals. 67 typename ArrayRef<T>::size_type AbsDiff = m > n ? m - n : n - m; 68 if (AbsDiff > MaxEditDistance) 69 return MaxEditDistance + 1; 70 } 71 72 SmallVector<unsigned, 64> Row(n + 1); 73 for (unsigned i = 1; i < Row.size(); ++i) 74 Row[i] = i; 75 76 for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { 77 Row[0] = y; 78 unsigned BestThisRow = Row[0]; 79 80 unsigned Previous = y - 1; 81 const auto &CurItem = Map(FromArray[y - 1]); 82 for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { 83 int OldRow = Row[x]; 84 if (AllowReplacements) { 85 Row[x] = std::min(Previous + (CurItem == Map(ToArray[x - 1]) ? 
0u : 1u), 86 std::min(Row[x - 1], Row[x]) + 1); 87 } 88 else { 89 if (CurItem == Map(ToArray[x - 1])) 90 Row[x] = Previous; 91 else Row[x] = std::min(Row[x-1], Row[x]) + 1; 92 } 93 Previous = OldRow; 94 BestThisRow = std::min(BestThisRow, Row[x]); 95 } 96 97 if (MaxEditDistance && BestThisRow > MaxEditDistance) 98 return MaxEditDistance + 1; 99 } 100 101 unsigned Result = Row[n]; 102 return Result; 103 } 104 105 template <typename T> 106 unsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, 107 bool AllowReplacements = true, 108 unsigned MaxEditDistance = 0) { 109 return ComputeMappedEditDistance( 110 FromArray, ToArray, [](const T &X) -> const T & { return X; }, 111 AllowReplacements, MaxEditDistance); 112 } 113 114 } // End llvm namespace 115 116 #endif 117