1 //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Lempel–Ziv–Welch encoding/decoding 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef SANITIZER_LZW_H 14 #define SANITIZER_LZW_H 15 16 #include "sanitizer_dense_map.h" 17 18 namespace __sanitizer { 19 20 using LzwCodeType = u32; 21 22 template <class T, class ItIn, class ItOut> 23 ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) { 24 using Substring = 25 detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>; 26 27 // Sentinel value for substrings of len 1. 28 static constexpr LzwCodeType kNoPrefix = 29 Min(DenseMapInfo<Substring>::getEmptyKey().first, 30 DenseMapInfo<Substring>::getTombstoneKey().first) - 31 1; 32 DenseMap<Substring, LzwCodeType> prefix_to_code; 33 { 34 // Add all substring of len 1 as initial dictionary. 35 InternalMmapVector<T> dict_len1; 36 for (auto it = begin; it != end; ++it) 37 if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second) 38 dict_len1.push_back(*it); 39 40 // Slightly helps with later delta encoding. 41 Sort(dict_len1.data(), dict_len1.size()); 42 43 // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can 44 // just generate them. 45 *out = dict_len1.size(); 46 ++out; 47 48 for (uptr i = 0; i != dict_len1.size(); ++i) { 49 // Remap after the Sort. 50 prefix_to_code[{kNoPrefix, dict_len1[i]}] = i; 51 *out = dict_len1[i]; 52 ++out; 53 } 54 CHECK_EQ(prefix_to_code.size(), dict_len1.size()); 55 } 56 57 if (begin == end) 58 return out; 59 60 // Main LZW encoding loop. 61 LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second; 62 ++begin; 63 for (auto it = begin; it != end; ++it) { 64 // Extend match with the new item. 65 auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size()); 66 if (ins.second) { 67 // This is a new substring, but emit the code for the current match 68 // (before extend). This allows LZW decoder to recover the dictionary. 69 *out = match; 70 ++out; 71 // Reset the match to a single item, which must be already in the map. 72 match = prefix_to_code.find({kNoPrefix, *it})->second; 73 } else { 74 // Already known, use as the current match. 75 match = ins.first->second; 76 } 77 } 78 79 *out = match; 80 ++out; 81 82 return out; 83 } 84 85 template <class T, class ItIn, class ItOut> 86 ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) { 87 if (begin == end) 88 return out; 89 90 // Load dictionary of len 1 substrings. Theses correspont to lowest codes. 91 InternalMmapVector<T> dict_len1(*begin); 92 ++begin; 93 94 if (begin == end) 95 return out; 96 97 for (auto& v : dict_len1) { 98 v = *begin; 99 ++begin; 100 } 101 102 // Substrings of len 2 and up. Indexes are shifted because [0, 103 // dict_len1.size()) stored in dict_len1. Substings get here after being 104 // emitted to the output, so we can use output position. 105 InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>> 106 code_to_substr; 107 108 // Copies already emitted substrings into the output again. 109 auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) { 110 if (code < dict_len1.size()) { 111 *out = dict_len1[code]; 112 ++out; 113 return out; 114 } 115 const auto& s = code_to_substr[code - dict_len1.size()]; 116 117 for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it; 118 return out; 119 }; 120 121 // Returns lens of the substring with the given code. 122 auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr { 123 if (code < dict_len1.size()) 124 return 1; 125 const auto& s = code_to_substr[code - dict_len1.size()]; 126 return s.second - s.first; 127 }; 128 129 // Main LZW decoding loop. 130 LzwCodeType prev_code = *begin; 131 ++begin; 132 out = copy(prev_code, out); 133 for (auto it = begin; it != end; ++it) { 134 LzwCodeType code = *it; 135 auto start = out; 136 if (code == dict_len1.size() + code_to_substr.size()) { 137 // Special LZW case. The code is not in the dictionary yet. This is 138 // possible only when the new substring is the same as previous one plus 139 // the first item of the previous substring. We can emit that in two 140 // steps. 141 out = copy(prev_code, out); 142 *out = *start; 143 ++out; 144 } else { 145 out = copy(code, out); 146 } 147 148 // Every time encoded emits the code, it also creates substing of len + 1 149 // including the first item of the just emmited substring. Do the same here. 150 uptr len = code_to_len(prev_code); 151 code_to_substr.push_back({start - len, start + 1}); 152 153 prev_code = code; 154 } 155 return out; 156 } 157 158 } // namespace __sanitizer 159 #endif 160