xref: /linux/fs/ntfs3/lib/decompress_common.c (revision 522e010b58379fbe19b38fdef5016bca0c3cf405)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * decompress_common.c - Code shared by the XPRESS and LZX decompressors
4  *
5  * Copyright (C) 2015 Eric Biggers
6  *
7  * This program is free software: you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License as published by the Free Software
9  * Foundation, either version 2 of the License, or (at your option) any later
10  * version.
11  *
12  * This program is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
15  * details.
16  *
17  * You should have received a copy of the GNU General Public License along with
18  * this program.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "decompress_common.h"
22 
23 /*
24  * make_huffman_decode_table() -
25  *
26  * Build a decoding table for a canonical prefix code, or "Huffman code".
27  *
28  * This is an internal function, not part of the library API!
29  *
30  * This takes as input the length of the codeword for each symbol in the
31  * alphabet and produces as output a table that can be used for fast
32  * decoding of prefix-encoded symbols using read_huffsym().
33  *
34  * Strictly speaking, a canonical prefix code might not be a Huffman
35  * code.  But this algorithm will work either way; and in fact, since
36  * Huffman codes are defined in terms of symbol frequencies, there is no
37  * way for the decompressor to know whether the code is a true Huffman
38  * code or not until all symbols have been decoded.
39  *
40  * Because the prefix code is assumed to be "canonical", it can be
41  * reconstructed directly from the codeword lengths.  A prefix code is
42  * canonical if and only if a longer codeword never lexicographically
43  * precedes a shorter codeword, and the lexicographic ordering of
44  * codewords of the same length is the same as the lexicographic ordering
45  * of the corresponding symbols.  Consequently, we can sort the symbols
46  * primarily by codeword length and secondarily by symbol value, then
47  * reconstruct the prefix code by generating codewords lexicographically
48  * in that order.
49  *
50  * This function does not, however, generate the prefix code explicitly.
51  * Instead, it directly builds a table for decoding symbols using the
52  * code.  The basic idea is this: given the next 'max_codeword_len' bits
53  * in the input, we can look up the decoded symbol by indexing a table
54  * containing 2**max_codeword_len entries.  A codeword with length
55  * 'max_codeword_len' will have exactly one entry in this table, whereas
56  * a codeword shorter than 'max_codeword_len' will have multiple entries
57  * in this table.  Precisely, a codeword of length n will be represented
58  * by 2**(max_codeword_len - n) entries in this table.  The 0-based index
59  * of each such entry will contain the corresponding codeword as a prefix
60  * when zero-padded on the left to 'max_codeword_len' binary digits.
61  *
62  * That's the basic idea, but we implement two optimizations regarding
63  * the format of the decode table itself:
64  *
65  * - For many compression formats, the maximum codeword length is too
66  *   long for it to be efficient to build the full decoding table
67  *   whenever a new prefix code is used.  Instead, we can build the table
68  *   using only 2**table_bits entries, where 'table_bits' is some number
69  *   less than or equal to 'max_codeword_len'.  Then, only codewords of
70  *   length 'table_bits' and shorter can be directly looked up.  For
71  *   longer codewords, the direct lookup instead produces the root of a
72  *   binary tree.  Using this tree, the decoder can do traditional
73  *   bit-by-bit decoding of the remainder of the codeword.  Child nodes
74  *   are allocated in extra entries at the end of the table; leaf nodes
75  *   contain symbols.  Note that the long-codeword case is, in general,
76  *   not performance critical, since in Huffman codes the most frequently
77  *   used symbols are assigned the shortest codeword lengths.
78  *
79  * - When we decode a symbol using a direct lookup of the table, we still
80  *   need to know its length so that the bitstream can be advanced by the
81  *   appropriate number of bits.  The simple solution is to simply retain
82  *   the 'lens' array and use the decoded symbol as an index into it.
83  *   However, this requires two separate array accesses in the fast path.
84  *   The optimization is to store the length directly in the decode
85  *   table.  We use the bottom 11 bits for the symbol and the top 5 bits
86  *   for the length.  In addition, to combine this optimization with the
87  *   previous one, we introduce a special case where the top 2 bits of
88  *   the length are both set if the entry is actually the root of a
89  *   binary tree.
90  *
91  * @decode_table:
92  *	The array in which to create the decoding table.  This must have
93  *	a length of at least ((2**table_bits) + 2 * num_syms) entries.
94  *
95  * @num_syms:
96  *	The number of symbols in the alphabet; also, the length of the
97  *	'lens' array.  Must be less than or equal to 2048.
98  *
99  * @table_bits:
100  *	The order of the decode table size, as explained above.  Must be
101  *	less than or equal to 13.
102  *
103  * @lens:
104  *	An array of length @num_syms, indexable by symbol, that gives the
105  *	length of the codeword, in bits, for that symbol.  The length can
106  *	be 0, which means that the symbol does not have a codeword
107  *	assigned.
108  *
109  * @max_codeword_len:
110  *	The longest codeword length allowed in the compression format.
111  *	All entries in 'lens' must be less than or equal to this value.
112  *	This must be less than or equal to 23.
113  *
114  * @working_space
115  *	A temporary array of length '2 * (max_codeword_len + 1) +
116  *	num_syms'.
117  *
118  * Returns 0 on success, or -1 if the lengths do not form a valid prefix
119  * code.
120  */
121 int make_huffman_decode_table(u16 decode_table[], const u32 num_syms,
122 			      const u32 table_bits, const u8 lens[],
123 			      const u32 max_codeword_len,
124 			      u16 working_space[])
125 {
126 	const u32 table_num_entries = 1 << table_bits;
127 	u16 * const len_counts = &working_space[0];
128 	u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
129 	u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
130 	int left;
131 	void *decode_table_ptr;
132 	u32 sym_idx;
133 	u32 codeword_len;
134 	u32 stores_per_loop;
135 	u32 decode_table_pos;
136 	u32 len;
137 	u32 sym;
138 
139 	/* Count how many symbols have each possible codeword length.
140 	 * Note that a length of 0 indicates the corresponding symbol is not
141 	 * used in the code and therefore does not have a codeword.
142 	 */
143 	for (len = 0; len <= max_codeword_len; len++)
144 		len_counts[len] = 0;
145 	for (sym = 0; sym < num_syms; sym++)
146 		len_counts[lens[sym]]++;
147 
148 	/* We can assume all lengths are <= max_codeword_len, but we
149 	 * cannot assume they form a valid prefix code.  A codeword of
150 	 * length n should require a proportion of the codespace equaling
151 	 * (1/2)^n.  The code is valid if and only if the codespace is
152 	 * exactly filled by the lengths, by this measure.
153 	 */
154 	left = 1;
155 	for (len = 1; len <= max_codeword_len; len++) {
156 		left <<= 1;
157 		left -= len_counts[len];
158 		if (left < 0) {
159 			/* The lengths overflow the codespace; that is, the code
160 			 * is over-subscribed.
161 			 */
162 			return -1;
163 		}
164 	}
165 
166 	if (left) {
167 		/* The lengths do not fill the codespace; that is, they form an
168 		 * incomplete set.
169 		 */
170 		if (left == (1 << max_codeword_len)) {
171 			/* The code is completely empty.  This is arguably
172 			 * invalid, but in fact it is valid in LZX and XPRESS,
173 			 * so we must allow it.  By definition, no symbols can
174 			 * be decoded with an empty code.  Consequently, we
175 			 * technically don't even need to fill in the decode
176 			 * table.  However, to avoid accessing uninitialized
177 			 * memory if the algorithm nevertheless attempts to
178 			 * decode symbols using such a code, we zero out the
179 			 * decode table.
180 			 */
181 			memset(decode_table, 0,
182 			       table_num_entries * sizeof(decode_table[0]));
183 			return 0;
184 		}
185 		return -1;
186 	}
187 
188 	/* Sort the symbols primarily by length and secondarily by symbol order.
189 	 */
190 
191 	/* Initialize 'offsets' so that offsets[len] for 1 <= len <=
192 	 * max_codeword_len is the number of codewords shorter than 'len' bits.
193 	 */
194 	offsets[1] = 0;
195 	for (len = 1; len < max_codeword_len; len++)
196 		offsets[len + 1] = offsets[len] + len_counts[len];
197 
198 	/* Use the 'offsets' array to sort the symbols.  Note that we do not
199 	 * include symbols that are not used in the code.  Consequently, fewer
200 	 * than 'num_syms' entries in 'sorted_syms' may be filled.
201 	 */
202 	for (sym = 0; sym < num_syms; sym++)
203 		if (lens[sym])
204 			sorted_syms[offsets[lens[sym]]++] = sym;
205 
206 	/* Fill entries for codewords with length <= table_bits
207 	 * --- that is, those short enough for a direct mapping.
208 	 *
209 	 * The table will start with entries for the shortest codeword(s), which
210 	 * have the most entries.  From there, the number of entries per
211 	 * codeword will decrease.
212 	 */
213 	decode_table_ptr = decode_table;
214 	sym_idx = 0;
215 	codeword_len = 1;
216 	stores_per_loop = (1 << (table_bits - codeword_len));
217 	for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
218 		u32 end_sym_idx = sym_idx + len_counts[codeword_len];
219 
220 		for (; sym_idx < end_sym_idx; sym_idx++) {
221 			u16 entry;
222 			u16 *p;
223 			u32 n;
224 
225 			entry = ((u32)codeword_len << 11) | sorted_syms[sym_idx];
226 			p = (u16 *)decode_table_ptr;
227 			n = stores_per_loop;
228 
229 			do {
230 				*p++ = entry;
231 			} while (--n);
232 
233 			decode_table_ptr = p;
234 		}
235 	}
236 
237 	/* If we've filled in the entire table, we are done.  Otherwise,
238 	 * there are codewords longer than table_bits for which we must
239 	 * generate binary trees.
240 	 */
241 	decode_table_pos = (u16 *)decode_table_ptr - decode_table;
242 	if (decode_table_pos != table_num_entries) {
243 		u32 j;
244 		u32 next_free_tree_slot;
245 		u32 cur_codeword;
246 
247 		/* First, zero out the remaining entries.  This is
248 		 * necessary so that these entries appear as
249 		 * "unallocated" in the next part.  Each of these entries
250 		 * will eventually be filled with the representation of
251 		 * the root node of a binary tree.
252 		 */
253 		j = decode_table_pos;
254 		do {
255 			decode_table[j] = 0;
256 		} while (++j != table_num_entries);
257 
258 		/* We allocate child nodes starting at the end of the
259 		 * direct lookup table.  Note that there should be
260 		 * 2*num_syms extra entries for this purpose, although
261 		 * fewer than this may actually be needed.
262 		 */
263 		next_free_tree_slot = table_num_entries;
264 
265 		/* Iterate through each codeword with length greater than
266 		 * 'table_bits', primarily in order of codeword length
267 		 * and secondarily in order of symbol.
268 		 */
269 		for (cur_codeword = decode_table_pos << 1;
270 		     codeword_len <= max_codeword_len;
271 		     codeword_len++, cur_codeword <<= 1) {
272 			u32 end_sym_idx = sym_idx + len_counts[codeword_len];
273 
274 			for (; sym_idx < end_sym_idx; sym_idx++, cur_codeword++) {
275 				/* 'sorted_sym' is the symbol represented by the
276 				 * codeword.
277 				 */
278 				u32 sorted_sym = sorted_syms[sym_idx];
279 				u32 extra_bits = codeword_len - table_bits;
280 				u32 node_idx = cur_codeword >> extra_bits;
281 
282 				/* Go through each bit of the current codeword
283 				 * beyond the prefix of length @table_bits and
284 				 * walk the appropriate binary tree, allocating
285 				 * any slots that have not yet been allocated.
286 				 *
287 				 * Note that the 'pointer' entry to the binary
288 				 * tree, which is stored in the direct lookup
289 				 * portion of the table, is represented
290 				 * identically to other internal (non-leaf)
291 				 * nodes of the binary tree; it can be thought
292 				 * of as simply the root of the tree.  The
293 				 * representation of these internal nodes is
294 				 * simply the index of the left child combined
295 				 * with the special bits 0xC000 to distingush
296 				 * the entry from direct mapping and leaf node
297 				 * entries.
298 				 */
299 				do {
300 					/* At least one bit remains in the
301 					 * codeword, but the current node is an
302 					 * unallocated leaf.  Change it to an
303 					 * internal node.
304 					 */
305 					if (decode_table[node_idx] == 0) {
306 						decode_table[node_idx] =
307 							next_free_tree_slot | 0xC000;
308 						decode_table[next_free_tree_slot++] = 0;
309 						decode_table[next_free_tree_slot++] = 0;
310 					}
311 
312 					/* Go to the left child if the next bit
313 					 * in the codeword is 0; otherwise go to
314 					 * the right child.
315 					 */
316 					node_idx = decode_table[node_idx] & 0x3FFF;
317 					--extra_bits;
318 					node_idx += (cur_codeword >> extra_bits) & 1;
319 				} while (extra_bits != 0);
320 
321 				/* We've traversed the tree using the entire
322 				 * codeword, and we're now at the entry where
323 				 * the actual symbol will be stored.  This is
324 				 * distinguished from internal nodes by not
325 				 * having its high two bits set.
326 				 */
327 				decode_table[node_idx] = sorted_sym;
328 			}
329 		}
330 	}
331 	return 0;
332 }
333