xref: /linux/lib/xz/xz_dec_bcj.c (revision 836d13a6ef8a2eb0eab2bd2de06f2deabc62b060)
1 // SPDX-License-Identifier: 0BSD
2 
3 /*
4  * Branch/Call/Jump (BCJ) filter decoders
5  *
6  * Authors: Lasse Collin <lasse.collin@tukaani.org>
7  *          Igor Pavlov <https://7-zip.org/>
8  */
9 
10 #include "xz_private.h"
11 
12 /*
13  * The rest of the file is inside this ifdef. It makes things a little more
14  * convenient when building without support for any BCJ filters.
15  */
16 #ifdef XZ_DEC_BCJ
17 
struct xz_dec_bcj {
	/*
	 * Type of the BCJ filter being used. The numeric values match
	 * the .xz Filter IDs; xz_dec_bcj_reset() compares the raw ID
	 * byte directly against these constants.
	 */
	enum {
		BCJ_X86 = 4,        /* x86 or x86-64 */
		BCJ_POWERPC = 5,    /* Big endian only */
		BCJ_IA64 = 6,       /* Big or little endian */
		BCJ_ARM = 7,        /* Little endian only */
		BCJ_ARMTHUMB = 8,   /* Little endian only */
		BCJ_SPARC = 9       /* Big or little endian */
	} type;

	/*
	 * Return value of the next filter in the chain. We need to preserve
	 * this information across calls, because we must not call the next
	 * filter anymore once it has returned XZ_STREAM_END.
	 */
	enum xz_ret ret;

	/* True if we are operating in single-call mode. */
	bool single_call;

	/*
	 * Absolute position relative to the beginning of the uncompressed
	 * data (in a single .xz Block). We care only about the lowest 32
	 * bits so this doesn't need to be uint64_t even with big files.
	 */
	uint32_t pos;

	/* x86 filter state (the prev_mask carried between bcj_x86() calls) */
	uint32_t x86_prev_mask;

	/*
	 * Temporary space to hold the variables from struct xz_buf while
	 * b->out is redirected to temp.buf in xz_dec_bcj_run().
	 */
	uint8_t *out;
	size_t out_pos;
	size_t out_size;

	struct {
		/* Amount of already filtered data in the beginning of buf */
		size_t filtered;

		/* Total amount of data currently stored in buf */
		size_t size;

		/*
		 * Buffer to hold a mix of filtered and unfiltered data. This
		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
		 *
		 * Type         Alignment   Look-ahead
		 * x86              1           4
		 * PowerPC          4           0
		 * IA-64           16           0
		 * ARM              4           0
		 * ARM-Thumb        2           2
		 * SPARC            4           0
		 */
		uint8_t buf[16];
	} temp;
};
76 
77 #ifdef XZ_DEC_X86
78 /*
79  * This is used to test the most significant byte of a memory address
80  * in an x86 instruction.
81  */
/*
 * Test whether a byte is 0x00 or 0xFF, i.e. whether the most significant
 * byte of a 32-bit address field looks like a sign extension.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	if (b == 0x00)
		return 1;

	return b == 0xFF;
}
86 
/*
 * Decode the x86 BCJ filter: convert the 32-bit absolute address fields
 * following E8 (CALL) and E9 (JMP) opcode bytes back to relative form.
 * Returns the number of bytes fully processed; the caller keeps the
 * remaining look-ahead bytes for the next call. The mask state carried
 * across calls is stored in s->x86_prev_mask.
 */
static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	/*
	 * Indexed by prev_mask: whether a conversion is allowed at this
	 * position, and which of the trailing bytes to test. The mask
	 * remembers which of the previous three bytes were E8/E9 opcodes.
	 */
	static const bool mask_to_allowed_status[8]
		= { true, true, true, false, true, false, false, false };

	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };

	size_t i;
	size_t prev_pos = (size_t)-1;
	uint32_t prev_mask = s->x86_prev_mask;
	uint32_t src;
	uint32_t dest;
	uint32_t j;
	uint8_t b;

	/* Each opcode byte needs four bytes of look-ahead. */
	if (size <= 4)
		return 0;

	size -= 4;
	for (i = 0; i < size; ++i) {
		/* 0xE8 and 0xE9 differ only in the lowest bit. */
		if ((buf[i] & 0xFE) != 0xE8)
			continue;

		/* Distance from the previous opcode byte. */
		prev_pos = i - prev_pos;
		if (prev_pos > 3) {
			prev_mask = 0;
		} else {
			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
			if (prev_mask != 0) {
				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
				if (!mask_to_allowed_status[prev_mask]
						|| bcj_x86_test_msbyte(b)) {
					/* Skip this opcode; remember it in the mask. */
					prev_pos = i;
					prev_mask = (prev_mask << 1) | 1;
					continue;
				}
			}
		}

		prev_pos = i;

		if (bcj_x86_test_msbyte(buf[i + 4])) {
			src = get_unaligned_le32(buf + i + 1);
			while (true) {
				/*
				 * Absolute minus position of the byte after
				 * the 5-byte instruction gives the relative
				 * address.
				 */
				dest = src - (s->pos + (uint32_t)i + 5);
				if (prev_mask == 0)
					break;

				j = mask_to_bit_num[prev_mask] * 8;
				b = (uint8_t)(dest >> (24 - j));
				if (!bcj_x86_test_msbyte(b))
					break;

				/* Retry with the conflicting bits flipped. */
				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
			}

			/* Sign-extend the 25-bit result to 32 bits. */
			dest &= 0x01FFFFFF;
			dest |= (uint32_t)0 - (dest & 0x01000000);
			put_unaligned_le32(dest, buf + i + 1);
			i += 4;
		} else {
			prev_mask = (prev_mask << 1) | 1;
		}
	}

	/* Save the mask state for the next call. */
	prev_pos = i - prev_pos;
	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
	return i;
}
156 #endif
157 
158 #ifdef XZ_DEC_POWERPC
159 static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
160 {
161 	size_t i;
162 	uint32_t instr;
163 
164 	for (i = 0; i + 4 <= size; i += 4) {
165 		instr = get_unaligned_be32(buf + i);
166 		if ((instr & 0xFC000003) == 0x48000001) {
167 			instr &= 0x03FFFFFC;
168 			instr -= s->pos + (uint32_t)i;
169 			instr &= 0x03FFFFFC;
170 			instr |= 0x48000001;
171 			put_unaligned_be32(instr, buf + i);
172 		}
173 	}
174 
175 	return i;
176 }
177 #endif
178 
179 #ifdef XZ_DEC_IA64
/*
 * Decode the IA-64 BCJ filter. Data is processed in 16-byte instruction
 * bundles; each bundle holds a 5-bit template and three 41-bit
 * instruction slots. Matching branch instructions get their address
 * field converted from absolute back to relative.
 */
static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	/*
	 * Indexed by the bundle template (low 5 bits of the first byte);
	 * each set bit marks an instruction slot that may need conversion.
	 */
	static const uint8_t branch_table[32] = {
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0,
		4, 4, 6, 6, 0, 0, 7, 7,
		4, 4, 0, 0, 4, 4, 0, 0
	};

	/*
	 * The local variables take a little bit stack space, but it's less
	 * than what LZMA2 decoder takes, so it doesn't make sense to reduce
	 * stack usage here without doing that for the LZMA2 decoder too.
	 */

	/* Loop counters */
	size_t i;
	size_t j;

	/* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
	uint32_t slot;

	/* Bitwise offset of the instruction indicated by slot */
	uint32_t bit_pos;

	/* bit_pos split into byte and bit parts */
	uint32_t byte_pos;
	uint32_t bit_res;

	/* Address part of an instruction */
	uint32_t addr;

	/* Mask used to detect which instructions to convert */
	uint32_t mask;

	/* 41-bit instruction stored somewhere in the lowest 48 bits */
	uint64_t instr;

	/* Instruction normalized with bit_res for easier manipulation */
	uint64_t norm;

	for (i = 0; i + 16 <= size; i += 16) {
		mask = branch_table[buf[i] & 0x1F];
		/* Slots start at bit 5 and are 41 bits wide. */
		for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
			if (((mask >> slot) & 1) == 0)
				continue;

			byte_pos = bit_pos >> 3;
			bit_res = bit_pos & 7;
			/* Load six bytes (little endian) covering the slot. */
			instr = 0;
			for (j = 0; j < 6; ++j)
				instr |= (uint64_t)(buf[i + j + byte_pos])
						<< (8 * j);

			norm = instr >> bit_res;

			/*
			 * Convert only if the opcode field (bits 37-40) is 5
			 * and bits 9-11 are zero.
			 */
			if (((norm >> 37) & 0x0F) == 0x05
					&& ((norm >> 9) & 0x07) == 0) {
				/*
				 * Assemble the 21-bit address: 20 bits at
				 * bit 13 plus the sign bit at bit 36; the
				 * value counts 16-byte bundles, hence << 4.
				 */
				addr = (norm >> 13) & 0x0FFFFF;
				addr |= ((uint32_t)(norm >> 36) & 1) << 20;
				addr <<= 4;
				addr -= s->pos + (uint32_t)i;
				addr >>= 4;

				/* Write the converted address fields back. */
				norm &= ~((uint64_t)0x8FFFFF << 13);
				norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
				norm |= (uint64_t)(addr & 0x100000)
						<< (36 - 20);

				/* Keep the bits below the slot unchanged. */
				instr &= (1 << bit_res) - 1;
				instr |= norm << bit_res;

				for (j = 0; j < 6; j++)
					buf[i + j + byte_pos]
						= (uint8_t)(instr >> (8 * j));
			}
		}
	}

	return i;
}
261 #endif
262 
263 #ifdef XZ_DEC_ARM
264 static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
265 {
266 	size_t i;
267 	uint32_t addr;
268 
269 	for (i = 0; i + 4 <= size; i += 4) {
270 		if (buf[i + 3] == 0xEB) {
271 			addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
272 					| ((uint32_t)buf[i + 2] << 16);
273 			addr <<= 2;
274 			addr -= s->pos + (uint32_t)i + 8;
275 			addr >>= 2;
276 			buf[i] = (uint8_t)addr;
277 			buf[i + 1] = (uint8_t)(addr >> 8);
278 			buf[i + 2] = (uint8_t)(addr >> 16);
279 		}
280 	}
281 
282 	return i;
283 }
284 #endif
285 
286 #ifdef XZ_DEC_ARMTHUMB
287 static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
288 {
289 	size_t i;
290 	uint32_t addr;
291 
292 	for (i = 0; i + 4 <= size; i += 2) {
293 		if ((buf[i + 1] & 0xF8) == 0xF0
294 				&& (buf[i + 3] & 0xF8) == 0xF8) {
295 			addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
296 					| ((uint32_t)buf[i] << 11)
297 					| (((uint32_t)buf[i + 3] & 0x07) << 8)
298 					| (uint32_t)buf[i + 2];
299 			addr <<= 1;
300 			addr -= s->pos + (uint32_t)i + 4;
301 			addr >>= 1;
302 			buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
303 			buf[i] = (uint8_t)(addr >> 11);
304 			buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
305 			buf[i + 2] = (uint8_t)addr;
306 			i += 2;
307 		}
308 	}
309 
310 	return i;
311 }
312 #endif
313 
314 #ifdef XZ_DEC_SPARC
315 static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
316 {
317 	size_t i;
318 	uint32_t instr;
319 
320 	for (i = 0; i + 4 <= size; i += 4) {
321 		instr = get_unaligned_be32(buf + i);
322 		if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
323 			instr <<= 2;
324 			instr -= s->pos + (uint32_t)i;
325 			instr >>= 2;
326 			instr = ((uint32_t)0x40000000 - (instr & 0x400000))
327 					| 0x40000000 | (instr & 0x3FFFFF);
328 			put_unaligned_be32(instr, buf + i);
329 		}
330 	}
331 
332 	return i;
333 }
334 #endif
335 
336 /*
337  * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
338  * of data that got filtered.
339  *
340  * NOTE: This is implemented as a switch statement to avoid using function
341  * pointers, which could be problematic in the kernel boot code, which must
342  * avoid pointers to static data (at least on x86).
343  */
344 static void bcj_apply(struct xz_dec_bcj *s,
345 		      uint8_t *buf, size_t *pos, size_t size)
346 {
347 	size_t filtered;
348 
349 	buf += *pos;
350 	size -= *pos;
351 
352 	switch (s->type) {
353 #ifdef XZ_DEC_X86
354 	case BCJ_X86:
355 		filtered = bcj_x86(s, buf, size);
356 		break;
357 #endif
358 #ifdef XZ_DEC_POWERPC
359 	case BCJ_POWERPC:
360 		filtered = bcj_powerpc(s, buf, size);
361 		break;
362 #endif
363 #ifdef XZ_DEC_IA64
364 	case BCJ_IA64:
365 		filtered = bcj_ia64(s, buf, size);
366 		break;
367 #endif
368 #ifdef XZ_DEC_ARM
369 	case BCJ_ARM:
370 		filtered = bcj_arm(s, buf, size);
371 		break;
372 #endif
373 #ifdef XZ_DEC_ARMTHUMB
374 	case BCJ_ARMTHUMB:
375 		filtered = bcj_armthumb(s, buf, size);
376 		break;
377 #endif
378 #ifdef XZ_DEC_SPARC
379 	case BCJ_SPARC:
380 		filtered = bcj_sparc(s, buf, size);
381 		break;
382 #endif
383 	default:
384 		/* Never reached but silence compiler warnings. */
385 		filtered = 0;
386 		break;
387 	}
388 
389 	*pos += filtered;
390 	s->pos += filtered;
391 }
392 
393 /*
394  * Flush pending filtered data from temp to the output buffer.
395  * Move the remaining mixture of possibly filtered and unfiltered
396  * data to the beginning of temp.
397  */
398 static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
399 {
400 	size_t copy_size;
401 
402 	copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
403 	memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
404 	b->out_pos += copy_size;
405 
406 	s->temp.filtered -= copy_size;
407 	s->temp.size -= copy_size;
408 	memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
409 }
410 
411 /*
412  * The BCJ filter functions are primitive in sense that they process the
413  * data in chunks of 1-16 bytes. To hide this issue, this function does
414  * some buffering.
415  */
/*
 * Decode data through the BCJ filter and the next filter (LZMA2) in the
 * chain. Returns XZ_OK when more calls are expected, XZ_STREAM_END when
 * everything including the buffered tail has been flushed, or an error
 * code from the next filter.
 */
XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,
				     struct xz_dec_lzma2 *lzma2,
				     struct xz_buf *b)
{
	size_t out_start;

	/*
	 * Flush pending already filtered data to the output buffer. Return
	 * immediately if we couldn't flush everything, or if the next
	 * filter in the chain had already returned XZ_STREAM_END.
	 */
	if (s->temp.filtered > 0) {
		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;

		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
	}

	/*
	 * If we have more output space than what is currently pending in
	 * temp, copy the unfiltered data from temp to the output buffer
	 * and try to fill the output buffer by decoding more data from the
	 * next filter in the chain. Apply the BCJ filter on the new data
	 * in the output buffer. If everything cannot be filtered, copy it
	 * to temp and rewind the output buffer position accordingly.
	 *
	 * This needs to be always run when temp.size == 0 to handle a special
	 * case where the output buffer is full and the next filter has no
	 * more output coming but hasn't returned XZ_STREAM_END yet.
	 */
	if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
		out_start = b->out_pos;
		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
		b->out_pos += s->temp.size;

		s->ret = xz_dec_lzma2_run(lzma2, b);
		/*
		 * Return on errors. XZ_OK is also returned in single-call
		 * mode: there will be no further calls to make progress.
		 */
		if (s->ret != XZ_STREAM_END
				&& (s->ret != XZ_OK || s->single_call))
			return s->ret;

		bcj_apply(s, b->out, &out_start, b->out_pos);

		/*
		 * As an exception, if the next filter returned XZ_STREAM_END,
		 * we can do that too, since the last few bytes that remain
		 * unfiltered are meant to remain unfiltered.
		 */
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;

		/* Stash whatever bcj_apply() couldn't filter back into temp. */
		s->temp.size = b->out_pos - out_start;
		b->out_pos -= s->temp.size;
		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);

		/*
		 * If there wasn't enough input to the next filter to fill
		 * the output buffer with unfiltered data, there's no point
		 * to try decoding more data to temp.
		 */
		if (b->out_pos + s->temp.size < b->out_size)
			return XZ_OK;
	}

	/*
	 * We have unfiltered data in temp. If the output buffer isn't full
	 * yet, try to fill the temp buffer by decoding more data from the
	 * next filter. Apply the BCJ filter on temp. Then we hopefully can
	 * fill the actual output buffer by copying filtered data from temp.
	 * A mix of filtered and unfiltered data may be left in temp; it will
	 * be taken care on the next call to this function.
	 */
	if (b->out_pos < b->out_size) {
		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
		s->out = b->out;
		s->out_pos = b->out_pos;
		s->out_size = b->out_size;
		b->out = s->temp.buf;
		b->out_pos = s->temp.size;
		b->out_size = sizeof(s->temp.buf);

		s->ret = xz_dec_lzma2_run(lzma2, b);

		/* Restore the caller's buffer description. */
		s->temp.size = b->out_pos;
		b->out = s->out;
		b->out_pos = s->out_pos;
		b->out_size = s->out_size;

		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
			return s->ret;

		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

		/*
		 * If the next filter returned XZ_STREAM_END, we mark that
		 * everything is filtered, since the last unfiltered bytes
		 * of the stream are meant to be left as is.
		 */
		if (s->ret == XZ_STREAM_END)
			s->temp.filtered = s->temp.size;

		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;
	}

	return s->ret;
}
525 
526 XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)
527 {
528 	struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
529 	if (s != NULL)
530 		s->single_call = single_call;
531 
532 	return s;
533 }
534 
/*
 * Prepare the decoder for a new .xz Block using the given Filter ID.
 * Returns XZ_OPTIONS_ERROR if the ID doesn't match a filter that was
 * compiled in, XZ_OK otherwise.
 */
XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)
{
	/*
	 * Accept only the Filter IDs whose decoders were compiled in;
	 * the case labels fall through to the shared break.
	 */
	switch (id) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
#endif
#ifdef XZ_DEC_IA64
	case BCJ_IA64:
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
#endif
		break;

	default:
		/* Unsupported Filter ID */
		return XZ_OPTIONS_ERROR;
	}

	/* Reset all per-Block state. */
	s->type = id;
	s->ret = XZ_OK;
	s->pos = 0;
	s->x86_prev_mask = 0;
	s->temp.filtered = 0;
	s->temp.size = 0;

	return XZ_OK;
}
572 
573 #endif
574