xref: /freebsd/contrib/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h (revision 0eae32dcef82f6f06de6419a0d623d7def0cc8f6)
1*0eae32dcSDimitry Andric //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
2*0eae32dcSDimitry Andric //
3*0eae32dcSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*0eae32dcSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*0eae32dcSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*0eae32dcSDimitry Andric //
7*0eae32dcSDimitry Andric //===----------------------------------------------------------------------===//
8*0eae32dcSDimitry Andric //
9*0eae32dcSDimitry Andric // Lempel–Ziv–Welch encoding/decoding
10*0eae32dcSDimitry Andric //
11*0eae32dcSDimitry Andric //===----------------------------------------------------------------------===//
12*0eae32dcSDimitry Andric 
13*0eae32dcSDimitry Andric #ifndef SANITIZER_LZW_H
14*0eae32dcSDimitry Andric #define SANITIZER_LZW_H
15*0eae32dcSDimitry Andric 
16*0eae32dcSDimitry Andric #include "sanitizer_dense_map.h"
17*0eae32dcSDimitry Andric 
18*0eae32dcSDimitry Andric namespace __sanitizer {
19*0eae32dcSDimitry Andric 
20*0eae32dcSDimitry Andric using LzwCodeType = u32;
21*0eae32dcSDimitry Andric 
22*0eae32dcSDimitry Andric template <class T, class ItIn, class ItOut>
LzwEncode(ItIn begin,ItIn end,ItOut out)23*0eae32dcSDimitry Andric ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
24*0eae32dcSDimitry Andric   using Substring =
25*0eae32dcSDimitry Andric       detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
26*0eae32dcSDimitry Andric 
27*0eae32dcSDimitry Andric   // Sentinel value for substrings of len 1.
28*0eae32dcSDimitry Andric   static constexpr LzwCodeType kNoPrefix =
29*0eae32dcSDimitry Andric       Min(DenseMapInfo<Substring>::getEmptyKey().first,
30*0eae32dcSDimitry Andric           DenseMapInfo<Substring>::getTombstoneKey().first) -
31*0eae32dcSDimitry Andric       1;
32*0eae32dcSDimitry Andric   DenseMap<Substring, LzwCodeType> prefix_to_code;
33*0eae32dcSDimitry Andric   {
34*0eae32dcSDimitry Andric     // Add all substring of len 1 as initial dictionary.
35*0eae32dcSDimitry Andric     InternalMmapVector<T> dict_len1;
36*0eae32dcSDimitry Andric     for (auto it = begin; it != end; ++it)
37*0eae32dcSDimitry Andric       if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
38*0eae32dcSDimitry Andric         dict_len1.push_back(*it);
39*0eae32dcSDimitry Andric 
40*0eae32dcSDimitry Andric     // Slightly helps with later delta encoding.
41*0eae32dcSDimitry Andric     Sort(dict_len1.data(), dict_len1.size());
42*0eae32dcSDimitry Andric 
43*0eae32dcSDimitry Andric     // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
44*0eae32dcSDimitry Andric     // just generate them.
45*0eae32dcSDimitry Andric     *out = dict_len1.size();
46*0eae32dcSDimitry Andric     ++out;
47*0eae32dcSDimitry Andric 
48*0eae32dcSDimitry Andric     for (uptr i = 0; i != dict_len1.size(); ++i) {
49*0eae32dcSDimitry Andric       // Remap after the Sort.
50*0eae32dcSDimitry Andric       prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
51*0eae32dcSDimitry Andric       *out = dict_len1[i];
52*0eae32dcSDimitry Andric       ++out;
53*0eae32dcSDimitry Andric     }
54*0eae32dcSDimitry Andric     CHECK_EQ(prefix_to_code.size(), dict_len1.size());
55*0eae32dcSDimitry Andric   }
56*0eae32dcSDimitry Andric 
57*0eae32dcSDimitry Andric   if (begin == end)
58*0eae32dcSDimitry Andric     return out;
59*0eae32dcSDimitry Andric 
60*0eae32dcSDimitry Andric   // Main LZW encoding loop.
61*0eae32dcSDimitry Andric   LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
62*0eae32dcSDimitry Andric   ++begin;
63*0eae32dcSDimitry Andric   for (auto it = begin; it != end; ++it) {
64*0eae32dcSDimitry Andric     // Extend match with the new item.
65*0eae32dcSDimitry Andric     auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
66*0eae32dcSDimitry Andric     if (ins.second) {
67*0eae32dcSDimitry Andric       // This is a new substring, but emit the code for the current match
68*0eae32dcSDimitry Andric       // (before extend). This allows LZW decoder to recover the dictionary.
69*0eae32dcSDimitry Andric       *out = match;
70*0eae32dcSDimitry Andric       ++out;
71*0eae32dcSDimitry Andric       // Reset the match to a single item, which must be already in the map.
72*0eae32dcSDimitry Andric       match = prefix_to_code.find({kNoPrefix, *it})->second;
73*0eae32dcSDimitry Andric     } else {
74*0eae32dcSDimitry Andric       // Already known, use as the current match.
75*0eae32dcSDimitry Andric       match = ins.first->second;
76*0eae32dcSDimitry Andric     }
77*0eae32dcSDimitry Andric   }
78*0eae32dcSDimitry Andric 
79*0eae32dcSDimitry Andric   *out = match;
80*0eae32dcSDimitry Andric   ++out;
81*0eae32dcSDimitry Andric 
82*0eae32dcSDimitry Andric   return out;
83*0eae32dcSDimitry Andric }
84*0eae32dcSDimitry Andric 
85*0eae32dcSDimitry Andric template <class T, class ItIn, class ItOut>
LzwDecode(ItIn begin,ItIn end,ItOut out)86*0eae32dcSDimitry Andric ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
87*0eae32dcSDimitry Andric   if (begin == end)
88*0eae32dcSDimitry Andric     return out;
89*0eae32dcSDimitry Andric 
90*0eae32dcSDimitry Andric   // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
91*0eae32dcSDimitry Andric   InternalMmapVector<T> dict_len1(*begin);
92*0eae32dcSDimitry Andric   ++begin;
93*0eae32dcSDimitry Andric 
94*0eae32dcSDimitry Andric   if (begin == end)
95*0eae32dcSDimitry Andric     return out;
96*0eae32dcSDimitry Andric 
97*0eae32dcSDimitry Andric   for (auto& v : dict_len1) {
98*0eae32dcSDimitry Andric     v = *begin;
99*0eae32dcSDimitry Andric     ++begin;
100*0eae32dcSDimitry Andric   }
101*0eae32dcSDimitry Andric 
102*0eae32dcSDimitry Andric   // Substrings of len 2 and up. Indexes are shifted because [0,
103*0eae32dcSDimitry Andric   // dict_len1.size()) stored in dict_len1. Substings get here after being
104*0eae32dcSDimitry Andric   // emitted to the output, so we can use output position.
105*0eae32dcSDimitry Andric   InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
106*0eae32dcSDimitry Andric       code_to_substr;
107*0eae32dcSDimitry Andric 
108*0eae32dcSDimitry Andric   // Copies already emitted substrings into the output again.
109*0eae32dcSDimitry Andric   auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
110*0eae32dcSDimitry Andric     if (code < dict_len1.size()) {
111*0eae32dcSDimitry Andric       *out = dict_len1[code];
112*0eae32dcSDimitry Andric       ++out;
113*0eae32dcSDimitry Andric       return out;
114*0eae32dcSDimitry Andric     }
115*0eae32dcSDimitry Andric     const auto& s = code_to_substr[code - dict_len1.size()];
116*0eae32dcSDimitry Andric 
117*0eae32dcSDimitry Andric     for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
118*0eae32dcSDimitry Andric     return out;
119*0eae32dcSDimitry Andric   };
120*0eae32dcSDimitry Andric 
121*0eae32dcSDimitry Andric   // Returns lens of the substring with the given code.
122*0eae32dcSDimitry Andric   auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
123*0eae32dcSDimitry Andric     if (code < dict_len1.size())
124*0eae32dcSDimitry Andric       return 1;
125*0eae32dcSDimitry Andric     const auto& s = code_to_substr[code - dict_len1.size()];
126*0eae32dcSDimitry Andric     return s.second - s.first;
127*0eae32dcSDimitry Andric   };
128*0eae32dcSDimitry Andric 
129*0eae32dcSDimitry Andric   // Main LZW decoding loop.
130*0eae32dcSDimitry Andric   LzwCodeType prev_code = *begin;
131*0eae32dcSDimitry Andric   ++begin;
132*0eae32dcSDimitry Andric   out = copy(prev_code, out);
133*0eae32dcSDimitry Andric   for (auto it = begin; it != end; ++it) {
134*0eae32dcSDimitry Andric     LzwCodeType code = *it;
135*0eae32dcSDimitry Andric     auto start = out;
136*0eae32dcSDimitry Andric     if (code == dict_len1.size() + code_to_substr.size()) {
137*0eae32dcSDimitry Andric       // Special LZW case. The code is not in the dictionary yet. This is
138*0eae32dcSDimitry Andric       // possible only when the new substring is the same as previous one plus
139*0eae32dcSDimitry Andric       // the first item of the previous substring. We can emit that in two
140*0eae32dcSDimitry Andric       // steps.
141*0eae32dcSDimitry Andric       out = copy(prev_code, out);
142*0eae32dcSDimitry Andric       *out = *start;
143*0eae32dcSDimitry Andric       ++out;
144*0eae32dcSDimitry Andric     } else {
145*0eae32dcSDimitry Andric       out = copy(code, out);
146*0eae32dcSDimitry Andric     }
147*0eae32dcSDimitry Andric 
148*0eae32dcSDimitry Andric     // Every time encoded emits the code, it also creates substing of len + 1
149*0eae32dcSDimitry Andric     // including the first item of the just emmited substring. Do the same here.
150*0eae32dcSDimitry Andric     uptr len = code_to_len(prev_code);
151*0eae32dcSDimitry Andric     code_to_substr.push_back({start - len, start + 1});
152*0eae32dcSDimitry Andric 
153*0eae32dcSDimitry Andric     prev_code = code;
154*0eae32dcSDimitry Andric   }
155*0eae32dcSDimitry Andric   return out;
156*0eae32dcSDimitry Andric }
157*0eae32dcSDimitry Andric 
158*0eae32dcSDimitry Andric }  // namespace __sanitizer
159*0eae32dcSDimitry Andric #endif
160