xref: /freebsd/usr.bin/gzip/unpack.c (revision 9bd497b8354567454e075076d40c996e21bd6095)
1 /*-
2  * Copyright (c) 2009 Xin LI <delphij@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 /* This file is #included by gzip.c */
30 
31 /*
32  * pack(1) file format:
33  *
34  * The first 7 bytes is the header:
35  *	00, 01 - Signature (US, RS), we already validated it earlier.
36  *	02..05 - Uncompressed size
37  *	    06 - Level for the huffman tree (<=24)
38  *
39  * pack(1) will then store symbols (leaf) nodes count in each huffman
40  * tree levels, each level would consume 1 byte (See [1]).
41  *
42  * After the symbol count table, there is the symbol table, storing
43  * symbols represented by coresponding leaf node.  EOB is not being
44  * explicitly transmitted (not necessary anyway) in the symbol table.
45  *
46  * Compressed data goes after the symbol table.
47  *
48  * NOTES
49  *
50  * [1] If we count EOB into the symbols, that would mean that we will
51  * have at most 256 symbols in the huffman tree.  pack(1) rejects empty
52  * file and files that just repeats one character, which means that we
53  * will have at least 2 symbols.  Therefore, pack(1) would reduce the
54  * last level symbol count by 2 which makes it a number in
55  * range [0..254], so all levels' symbol count would fit into 1 byte.
56  */
57 
58 #define	PACK_HEADER_LENGTH	7
59 #define	HTREE_MAXLEVEL		24
60 
61 /*
62  * unpack descriptor
63  *
64  * Represent the huffman tree in a similiar way that pack(1) would
65  * store in a packed file.  We store all symbols in a linear table,
66  * and store pointers to each level's first symbol.  In addition to
67  * that, maintain two counts for each level: inner nodes count and
68  * leaf nodes count.
69  */
70 typedef struct {
71 	int		symbol_size;	/* Size of the symbol table */
72 	int		treelevels;	/* Levels for the huffman tree */
73 
74 	int		*symbolsin;	/* Table of leaf symbols count in
75 					   each level */
76 	int		*inodesin;	/* Table of internal nodes count in
77 					   each level */
78 
79 	char		*symbol;	/* The symbol table */
80 	char		*symbol_eob;	/* Pointer to the EOB symbol */
81 	char		**tree;		/* Decoding huffman tree (pointers to
82 					   first symbol of each tree level */
83 
84 	off_t		uncompressed_size; /* Uncompressed size */
85 	FILE		*fpIn;		/* Input stream */
86 	FILE		*fpOut;		/* Output stream */
87 } unpack_descriptor_t;
88 
89 /*
90  * Release resource allocated to an unpack descriptor.
91  *
92  * Caller is responsible to make sure that all of these pointers are
93  * initialized (in our case, they all point to valid memory block).
94  * We don't zero out pointers here because nobody else would ever
95  * reference the memory block without scrubing them.
96  */
97 static void
98 unpack_descriptor_fini(unpack_descriptor_t *unpackd)
99 {
100 
101 	free(unpackd->symbolsin);
102 	free(unpackd->inodesin);
103 	free(unpackd->symbol);
104 	free(unpackd->tree);
105 
106 	fclose(unpackd->fpIn);
107 	fclose(unpackd->fpOut);
108 }
109 
110 /*
111  * Recursively fill the internal node count table
112  */
113 static void
114 unpackd_fill_inodesin(const unpack_descriptor_t *unpackd, int level)
115 {
116 
117 	/*
118 	 * The internal nodes would be 1/2 of total internal nodes and
119 	 * leaf nodes in the next level.  For the last level there
120 	 * would be no internal node by defination.
121 	 */
122 	if (level < unpackd->treelevels) {
123 		unpackd_fill_inodesin(unpackd, level + 1);
124 		unpackd->inodesin[level] = (unpackd->inodesin[level + 1] +
125 					  unpackd->symbolsin[level + 1]) / 2;
126 	} else
127 		unpackd->inodesin[level] = 0;
128 }
129 
130 /*
131  * Update counter for accepted bytes
132  */
133 static void
134 accepted_bytes(off_t *bytes_in, off_t newbytes)
135 {
136 
137 	if (bytes_in != NULL)
138 		(*bytes_in) += newbytes;
139 }
140 
141 /*
142  * Read file header and construct the tree.  Also, prepare the buffered I/O
143  * for decode rountine.
144  *
145  * Return value is uncompressed size.
146  */
147 static void
148 unpack_parse_header(int in, int out, char *pre, size_t prelen, off_t *bytes_in,
149     unpack_descriptor_t *unpackd)
150 {
151 	unsigned char hdr[PACK_HEADER_LENGTH];	/* buffer for header */
152 	ssize_t bytesread;		/* Bytes read from the file */
153 	int i, j, thisbyte;
154 
155 	/* Prepend the header buffer if we already read some data */
156 	if (prelen != 0)
157 		memcpy(hdr, pre, prelen);
158 
159 	/* Read in and fill the rest bytes of header */
160 	bytesread = read(in, hdr + prelen, PACK_HEADER_LENGTH - prelen);
161 	if (bytesread < 0)
162 		maybe_err("Error reading pack header");
163 
164 	accepted_bytes(bytes_in, PACK_HEADER_LENGTH);
165 
166 	/* Obtain uncompressed length (bytes 2,3,4,5)*/
167 	unpackd->uncompressed_size = 0;
168 	for (i = 2; i <= 5; i++) {
169 		unpackd->uncompressed_size <<= 8;
170 		unpackd->uncompressed_size |= hdr[i];
171 	}
172 
173 	/* Get the levels of the tree */
174 	unpackd->treelevels = hdr[6];
175 	if (unpackd->treelevels > HTREE_MAXLEVEL || unpackd->treelevels < 1)
176 		maybe_errx("Huffman tree has insane levels");
177 
178 	/* Let libc take care for buffering from now on */
179 	if ((unpackd->fpIn = fdopen(in, "r")) == NULL)
180 		maybe_err("Can not fdopen() input stream");
181 	if ((unpackd->fpOut = fdopen(out, "w")) == NULL)
182 		maybe_err("Can not fdopen() output stream");
183 
184 	/* Allocate for the tables of bounds and the tree itself */
185 	unpackd->inodesin =
186 	    calloc(unpackd->treelevels, sizeof(*(unpackd->inodesin)));
187 	unpackd->symbolsin =
188 	    calloc(unpackd->treelevels, sizeof(*(unpackd->symbolsin)));
189 	unpackd->tree =
190 	    calloc(unpackd->treelevels, (sizeof (*(unpackd->tree))));
191 	if (unpackd->inodesin == NULL || unpackd->symbolsin == NULL ||
192 	    unpackd->tree == NULL)
193 		maybe_err("calloc");
194 
195 	/* We count from 0 so adjust to match array upper bound */
196 	unpackd->treelevels--;
197 
198 	/* Read the levels symbol count table and caculate total */
199 	unpackd->symbol_size = 1;		/* EOB */
200 	for (i = 0; i <= unpackd->treelevels; i++) {
201 		if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
202 			maybe_err("File appears to be truncated");
203 		unpackd->symbolsin[i] = (unsigned char)thisbyte;
204 		unpackd->symbol_size += unpackd->symbolsin[i];
205 	}
206 	accepted_bytes(bytes_in, unpackd->treelevels);
207 	if (unpackd->symbol_size > 256)
208 		maybe_errx("Bad symbol table");
209 
210 	/* Allocate for the symbol table, point symbol_eob at the beginning */
211 	unpackd->symbol_eob = unpackd->symbol = calloc(1, unpackd->symbol_size);
212 	if (unpackd->symbol == NULL)
213 		maybe_err("calloc");
214 
215 	/*
216 	 * Read in the symbol table, which contain [2, 256] symbols.
217 	 * In order to fit the count in one byte, pack(1) would offset
218 	 * it by reducing 2 from the actual number from the last level.
219 	 *
220 	 * We adjust the last level's symbol count by 1 here, because
221 	 * the EOB symbol is not being transmitted explicitly.  Another
222 	 * adjustment would be done later afterward.
223 	 */
224 	unpackd->symbolsin[unpackd->treelevels]++;
225 	for (i = 0; i <= unpackd->treelevels; i++) {
226 		unpackd->tree[i] = unpackd->symbol_eob;
227 		for (j = 0; j < unpackd->symbolsin[i]; j++) {
228 			if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
229 				maybe_errx("Symbol table truncated");
230 			*unpackd->symbol_eob++ = (char)thisbyte;
231 		}
232 		accepted_bytes(bytes_in, unpackd->symbolsin[i]);
233 	}
234 
235 	/* Now, take account for the EOB symbol as well */
236 	unpackd->symbolsin[unpackd->treelevels]++;
237 
238 	/*
239 	 * The symbolsin table has been constructed now.
240 	 * Caculate the internal nodes count table based on it.
241 	 */
242 	unpackd_fill_inodesin(unpackd, 0);
243 }
244 
245 /*
246  * Decode huffman stream, based on the huffman tree.
247  */
248 static void
249 unpack_decode(const unpack_descriptor_t *unpackd, off_t *bytes_in)
250 {
251 	int thislevel, thiscode, thisbyte, inlevelindex;
252 	int i;
253 	off_t bytes_out = 0;
254 	const char *thissymbol;	/* The symbol pointer decoded from stream */
255 
256 	/*
257 	 * Decode huffman.  Fetch every bytes from the file, get it
258 	 * into 'thiscode' bit-by-bit, then output the symbol we got
259 	 * when one has been found.
260 	 *
261 	 * Assumption: sizeof(int) > ((max tree levels + 1) / 8).
262 	 * bad things could happen if not.
263 	 */
264 	thislevel = 0;
265 	thiscode = thisbyte = 0;
266 
267 	while ((thisbyte = fgetc(unpackd->fpIn)) != EOF) {
268 		accepted_bytes(bytes_in, 1);
269 
270 		/*
271 		 * Split one bit from thisbyte, from highest to lowest,
272 		 * feed the bit into thiscode, until we got a symbol from
273 		 * the tree.
274 		 */
275 		for (i = 7; i >= 0; i--) {
276 			thiscode = (thiscode << 1) | ((thisbyte >> i) & 1);
277 
278 			/* Did we got a symbol? (referencing leaf node) */
279 			if (thiscode >= unpackd->inodesin[thislevel]) {
280 				inlevelindex =
281 				    thiscode - unpackd->inodesin[thislevel];
282 				if (inlevelindex > unpackd->symbolsin[thislevel])
283 					maybe_errx("File corrupt");
284 
285 				thissymbol =
286 				    &(unpackd->tree[thislevel][inlevelindex]);
287 				if ((thissymbol == unpackd->symbol_eob) &&
288 				    (bytes_out == unpackd->uncompressed_size))
289 					goto finished;
290 
291 				fputc((*thissymbol), unpackd->fpOut);
292 				bytes_out++;
293 
294 				/* Prepare for next input */
295 				thislevel = 0; thiscode = 0;
296 			} else {
297 				thislevel++;
298 				if (thislevel > unpackd->treelevels)
299 					maybe_errx("File corrupt");
300 			}
301 		}
302 	}
303 
304 finished:
305 	if (bytes_out != unpackd->uncompressed_size)
306 		maybe_errx("Premature EOF");
307 }
308 
309 /* Handler for pack(1)'ed file */
310 static off_t
311 unpack(int in, int out, char *pre, size_t prelen, off_t *bytes_in)
312 {
313 	unpack_descriptor_t	unpackd;
314 
315 	unpack_parse_header(dup(in), dup(out), pre, prelen, bytes_in, &unpackd);
316 	unpack_decode(&unpackd, bytes_in);
317 	unpack_descriptor_fini(&unpackd);
318 
319 	/* If we reached here, the unpack was successful */
320 	return (unpackd.uncompressed_size);
321 }
322 
323