1 //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Lempel–Ziv–Welch encoding/decoding
10 //
11 //===----------------------------------------------------------------------===//
12
13 #ifndef SANITIZER_LZW_H
14 #define SANITIZER_LZW_H
15
16 #include "sanitizer_dense_map.h"
17
18 namespace __sanitizer {
19
// Integer type of the codes written by LzwEncode and read back by LzwDecode.
using LzwCodeType = u32;
21
22 template <class T, class ItIn, class ItOut>
LzwEncode(ItIn begin,ItIn end,ItOut out)23 ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
24 using Substring =
25 detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
26
27 // Sentinel value for substrings of len 1.
28 static constexpr LzwCodeType kNoPrefix =
29 Min(DenseMapInfo<Substring>::getEmptyKey().first,
30 DenseMapInfo<Substring>::getTombstoneKey().first) -
31 1;
32 DenseMap<Substring, LzwCodeType> prefix_to_code;
33 {
34 // Add all substring of len 1 as initial dictionary.
35 InternalMmapVector<T> dict_len1;
36 for (auto it = begin; it != end; ++it)
37 if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
38 dict_len1.push_back(*it);
39
40 // Slightly helps with later delta encoding.
41 Sort(dict_len1.data(), dict_len1.size());
42
43 // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
44 // just generate them.
45 *out = dict_len1.size();
46 ++out;
47
48 for (uptr i = 0; i != dict_len1.size(); ++i) {
49 // Remap after the Sort.
50 prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
51 *out = dict_len1[i];
52 ++out;
53 }
54 CHECK_EQ(prefix_to_code.size(), dict_len1.size());
55 }
56
57 if (begin == end)
58 return out;
59
60 // Main LZW encoding loop.
61 LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
62 ++begin;
63 for (auto it = begin; it != end; ++it) {
64 // Extend match with the new item.
65 auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
66 if (ins.second) {
67 // This is a new substring, but emit the code for the current match
68 // (before extend). This allows LZW decoder to recover the dictionary.
69 *out = match;
70 ++out;
71 // Reset the match to a single item, which must be already in the map.
72 match = prefix_to_code.find({kNoPrefix, *it})->second;
73 } else {
74 // Already known, use as the current match.
75 match = ins.first->second;
76 }
77 }
78
79 *out = match;
80 ++out;
81
82 return out;
83 }
84
85 template <class T, class ItIn, class ItOut>
LzwDecode(ItIn begin,ItIn end,ItOut out)86 ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
87 if (begin == end)
88 return out;
89
90 // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
91 InternalMmapVector<T> dict_len1(*begin);
92 ++begin;
93
94 if (begin == end)
95 return out;
96
97 for (auto& v : dict_len1) {
98 v = *begin;
99 ++begin;
100 }
101
102 // Substrings of len 2 and up. Indexes are shifted because [0,
103 // dict_len1.size()) stored in dict_len1. Substings get here after being
104 // emitted to the output, so we can use output position.
105 InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
106 code_to_substr;
107
108 // Copies already emitted substrings into the output again.
109 auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
110 if (code < dict_len1.size()) {
111 *out = dict_len1[code];
112 ++out;
113 return out;
114 }
115 const auto& s = code_to_substr[code - dict_len1.size()];
116
117 for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
118 return out;
119 };
120
121 // Returns lens of the substring with the given code.
122 auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
123 if (code < dict_len1.size())
124 return 1;
125 const auto& s = code_to_substr[code - dict_len1.size()];
126 return s.second - s.first;
127 };
128
129 // Main LZW decoding loop.
130 LzwCodeType prev_code = *begin;
131 ++begin;
132 out = copy(prev_code, out);
133 for (auto it = begin; it != end; ++it) {
134 LzwCodeType code = *it;
135 auto start = out;
136 if (code == dict_len1.size() + code_to_substr.size()) {
137 // Special LZW case. The code is not in the dictionary yet. This is
138 // possible only when the new substring is the same as previous one plus
139 // the first item of the previous substring. We can emit that in two
140 // steps.
141 out = copy(prev_code, out);
142 *out = *start;
143 ++out;
144 } else {
145 out = copy(code, out);
146 }
147
148 // Every time encoded emits the code, it also creates substing of len + 1
149 // including the first item of the just emmited substring. Do the same here.
150 uptr len = code_to_len(prev_code);
151 code_to_substr.push_back({start - len, start + 1});
152
153 prev_code = code;
154 }
155 return out;
156 }
157
158 } // namespace __sanitizer
159 #endif
160