xref: /linux/lib/xz/xz_dec_bcj.c (revision 509d3f45847627f4c5cdce004c3ec79262b5239c)
1 // SPDX-License-Identifier: 0BSD
2 
3 /*
4  * Branch/Call/Jump (BCJ) filter decoders
5  *
6  * Authors: Lasse Collin <lasse.collin@tukaani.org>
7  *          Igor Pavlov <https://7-zip.org/>
8  */
9 
10 #include "xz_private.h"
11 
12 /*
13  * The rest of the file is inside this ifdef. It makes things a little more
14  * convenient when building without support for any BCJ filters.
15  */
16 #ifdef XZ_DEC_BCJ
17 
struct xz_dec_bcj {
	/* Type of the BCJ filter being used */
	enum {
		BCJ_X86 = 4,        /* x86 or x86-64 */
		BCJ_POWERPC = 5,    /* Big endian only */
		BCJ_ARM = 7,        /* Little endian only */
		BCJ_ARMTHUMB = 8,   /* Little endian only */
		BCJ_SPARC = 9,      /* Big or little endian */
		BCJ_ARM64 = 10,     /* AArch64 */
		BCJ_RISCV = 11      /* RV32GQC_Zfh, RV64GQC_Zfh */
	} type;

	/*
	 * Return value of the next filter in the chain. We need to preserve
	 * this information across calls, because we must not call the next
	 * filter anymore once it has returned XZ_STREAM_END.
	 */
	enum xz_ret ret;

	/* True if we are operating in single-call mode. */
	bool single_call;

	/*
	 * Absolute position relative to the beginning of the uncompressed
	 * data (in a single .xz Block). We care only about the lowest 32
	 * bits so this doesn't need to be uint64_t even with big files.
	 */
	uint32_t pos;

	/* x86 filter state (mask of recent unconverted 0xE8/0xE9 bytes) */
	uint32_t x86_prev_mask;

	/*
	 * Temporary space to hold the variables from struct xz_buf while
	 * xz_dec_bcj_run() redirects the next filter's output into temp.buf.
	 */
	uint8_t *out;
	size_t out_pos;
	size_t out_size;

	struct {
		/* Amount of already filtered data in the beginning of buf */
		size_t filtered;

		/* Total amount of data currently stored in buf  */
		size_t size;

		/*
		 * Buffer to hold a mix of filtered and unfiltered data. This
		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
		 *
		 * Type         Alignment   Look-ahead
		 * x86              1           4
		 * PowerPC          4           0
		 * IA-64           16           0
		 * ARM              4           0
		 * ARM-Thumb        2           2
		 * SPARC            4           0
		 *
		 * NOTE(review): the table predates the ARM64 and RISC-V
		 * filters, and no IA-64 filter is implemented in this file.
		 * bcj_riscv() accesses at most 8 bytes per instruction, so
		 * 16 bytes still appears sufficient -- confirm against the
		 * encoder's alignment/look-ahead requirements.
		 */
		uint8_t buf[16];
	} temp;
};
77 
78 #ifdef XZ_DEC_X86
/*
 * Test whether the most significant byte of a 32-bit address operand in
 * an x86 instruction indicates a convertible branch target: the encoder
 * only touches operands whose top byte is 0x00 or 0xFF.
 *
 * Returns 1 for 0x00 and 0xFF, 0 for everything else.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	if (b == 0x00)
		return 1;

	return b == 0xFF;
}
87 
/*
 * Decode the x86 BCJ filter: convert the 32-bit operands of CALL (0xE8)
 * and JMP (0xE9) instructions from absolute addresses back into the
 * relative offsets the original machine code used.
 *
 * Returns the number of bytes of buf that were fully processed. The last
 * four bytes are never processed here because a 0xE8/0xE9 near the end of
 * buf could have its operand continue past the end; the caller keeps the
 * unprocessed tail for the next call.
 */
static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	/*
	 * Indexed by the low three bits of prev_mask: which combinations
	 * of recently seen, unconverted 0xE8/0xE9 bytes still allow a
	 * conversion at the current position.
	 */
	static const bool mask_to_allowed_status[8]
		= { true, true, true, false, true, false, false, false };

	/*
	 * Distance (in bytes) to the most recent unconverted 0xE8/0xE9
	 * recorded in prev_mask (index of the highest set bit, 1-based).
	 */
	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };

	size_t i;
	/* Position of the previous 0xE8/0xE9 byte; (size_t)-1 = none yet */
	size_t prev_pos = (size_t)-1;
	/* One bit per recent byte position: 0xE8/0xE9 left unconverted */
	uint32_t prev_mask = s->x86_prev_mask;
	uint32_t src;
	uint32_t dest;
	uint32_t j;
	uint8_t b;

	/* An instruction needs one opcode byte plus a 4-byte operand. */
	if (size <= 4)
		return 0;

	size -= 4;
	for (i = 0; i < size; ++i) {
		/* 0xE8 (CALL) and 0xE9 (JMP) differ only in the lowest bit. */
		if ((buf[i] & 0xFE) != 0xE8)
			continue;

		/* Reuse prev_pos temporarily as the gap since the last hit. */
		prev_pos = i - prev_pos;
		if (prev_pos > 3) {
			/* Too far away; earlier hits cannot overlap us. */
			prev_mask = 0;
		} else {
			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
			if (prev_mask != 0) {
				/*
				 * An unconverted opcode byte overlaps this
				 * operand. Skip this position if the earlier
				 * one makes a conversion here invalid.
				 */
				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
				if (!mask_to_allowed_status[prev_mask]
						|| bcj_x86_test_msbyte(b)) {
					prev_pos = i;
					prev_mask = (prev_mask << 1) | 1;
					continue;
				}
			}
		}

		prev_pos = i;

		if (bcj_x86_test_msbyte(buf[i + 4])) {
			src = get_unaligned_le32(buf + i + 1);
			while (true) {
				/*
				 * Absolute address -> relative offset.
				 * The +5 accounts for the instruction
				 * length (opcode + 32-bit operand).
				 */
				dest = src - (s->pos + (uint32_t)i + 5);
				if (prev_mask == 0)
					break;

				/*
				 * If the result still looks like a
				 * convertible address at an overlapping
				 * earlier position, adjust and retry so
				 * decoding matches what the encoder did.
				 */
				j = mask_to_bit_num[prev_mask] * 8;
				b = (uint8_t)(dest >> (24 - j));
				if (!bcj_x86_test_msbyte(b))
					break;

				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
			}

			/* Keep 25 bits and sign-extend from bit 24. */
			dest &= 0x01FFFFFF;
			dest |= (uint32_t)0 - (dest & 0x01000000);
			put_unaligned_le32(dest, buf + i + 1);
			i += 4;
		} else {
			/* Not converted; remember this opcode byte. */
			prev_mask = (prev_mask << 1) | 1;
		}
	}

	prev_pos = i - prev_pos;
	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
	return i;
}
157 #endif
158 
159 #ifdef XZ_DEC_POWERPC
bcj_powerpc(struct xz_dec_bcj * s,uint8_t * buf,size_t size)160 static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
161 {
162 	size_t i;
163 	uint32_t instr;
164 
165 	size &= ~(size_t)3;
166 
167 	for (i = 0; i < size; i += 4) {
168 		instr = get_unaligned_be32(buf + i);
169 		if ((instr & 0xFC000003) == 0x48000001) {
170 			instr &= 0x03FFFFFC;
171 			instr -= s->pos + (uint32_t)i;
172 			instr &= 0x03FFFFFC;
173 			instr |= 0x48000001;
174 			put_unaligned_be32(instr, buf + i);
175 		}
176 	}
177 
178 	return i;
179 }
180 #endif
181 
182 #ifdef XZ_DEC_ARM
bcj_arm(struct xz_dec_bcj * s,uint8_t * buf,size_t size)183 static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
184 {
185 	size_t i;
186 	uint32_t addr;
187 
188 	size &= ~(size_t)3;
189 
190 	for (i = 0; i < size; i += 4) {
191 		if (buf[i + 3] == 0xEB) {
192 			addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
193 					| ((uint32_t)buf[i + 2] << 16);
194 			addr <<= 2;
195 			addr -= s->pos + (uint32_t)i + 8;
196 			addr >>= 2;
197 			buf[i] = (uint8_t)addr;
198 			buf[i + 1] = (uint8_t)(addr >> 8);
199 			buf[i + 2] = (uint8_t)(addr >> 16);
200 		}
201 	}
202 
203 	return i;
204 }
205 #endif
206 
207 #ifdef XZ_DEC_ARMTHUMB
bcj_armthumb(struct xz_dec_bcj * s,uint8_t * buf,size_t size)208 static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
209 {
210 	size_t i;
211 	uint32_t addr;
212 
213 	if (size < 4)
214 		return 0;
215 
216 	size -= 4;
217 
218 	for (i = 0; i <= size; i += 2) {
219 		if ((buf[i + 1] & 0xF8) == 0xF0
220 				&& (buf[i + 3] & 0xF8) == 0xF8) {
221 			addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
222 					| ((uint32_t)buf[i] << 11)
223 					| (((uint32_t)buf[i + 3] & 0x07) << 8)
224 					| (uint32_t)buf[i + 2];
225 			addr <<= 1;
226 			addr -= s->pos + (uint32_t)i + 4;
227 			addr >>= 1;
228 			buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
229 			buf[i] = (uint8_t)(addr >> 11);
230 			buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
231 			buf[i + 2] = (uint8_t)addr;
232 			i += 2;
233 		}
234 	}
235 
236 	return i;
237 }
238 #endif
239 
240 #ifdef XZ_DEC_SPARC
bcj_sparc(struct xz_dec_bcj * s,uint8_t * buf,size_t size)241 static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
242 {
243 	size_t i;
244 	uint32_t instr;
245 
246 	size &= ~(size_t)3;
247 
248 	for (i = 0; i < size; i += 4) {
249 		instr = get_unaligned_be32(buf + i);
250 		if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
251 			instr <<= 2;
252 			instr -= s->pos + (uint32_t)i;
253 			instr >>= 2;
254 			instr = ((uint32_t)0x40000000 - (instr & 0x400000))
255 					| 0x40000000 | (instr & 0x3FFFFF);
256 			put_unaligned_be32(instr, buf + i);
257 		}
258 	}
259 
260 	return i;
261 }
262 #endif
263 
264 #ifdef XZ_DEC_ARM64
bcj_arm64(struct xz_dec_bcj * s,uint8_t * buf,size_t size)265 static size_t bcj_arm64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
266 {
267 	size_t i;
268 	uint32_t instr;
269 	uint32_t addr;
270 
271 	size &= ~(size_t)3;
272 
273 	for (i = 0; i < size; i += 4) {
274 		instr = get_unaligned_le32(buf + i);
275 
276 		if ((instr >> 26) == 0x25) {
277 			/* BL instruction */
278 			addr = instr - ((s->pos + (uint32_t)i) >> 2);
279 			instr = 0x94000000 | (addr & 0x03FFFFFF);
280 			put_unaligned_le32(instr, buf + i);
281 
282 		} else if ((instr & 0x9F000000) == 0x90000000) {
283 			/* ADRP instruction */
284 			addr = ((instr >> 29) & 3) | ((instr >> 3) & 0x1FFFFC);
285 
286 			/* Only convert values in the range +/-512 MiB. */
287 			if ((addr + 0x020000) & 0x1C0000)
288 				continue;
289 
290 			addr -= (s->pos + (uint32_t)i) >> 12;
291 
292 			instr &= 0x9000001F;
293 			instr |= (addr & 3) << 29;
294 			instr |= (addr & 0x03FFFC) << 3;
295 			instr |= (0U - (addr & 0x020000)) & 0xE00000;
296 
297 			put_unaligned_le32(instr, buf + i);
298 		}
299 	}
300 
301 	return i;
302 }
303 #endif
304 
305 #ifdef XZ_DEC_RISCV
/*
 * Decode the RISC-V BCJ filter: convert the targets of JAL instructions
 * and of AUIPC instruction pairs from the encoder's absolute form back
 * to the original relative form.
 *
 * Instructions may be 2-byte aligned (compressed extension), so the scan
 * advances in halfwords. Eight bytes of look-ahead are required because
 * an AUIPC pair spans two 32-bit instructions.
 */
static size_t bcj_riscv(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t b1;
	uint32_t b2;
	uint32_t b3;
	uint32_t instr;
	uint32_t instr2;
	uint32_t instr2_rs1;
	uint32_t addr;

	if (size < 8)
		return 0;

	size -= 8;

	for (i = 0; i <= size; i += 2) {
		instr = buf[i];

		if (instr == 0xEF) {
			/* JAL */
			b1 = buf[i + 1];
			/*
			 * Filter only the rd values the encoder converted.
			 * NOTE(review): 0x0D masks bits of the rd field in
			 * this byte -- confirm the exact rd set against the
			 * encoder.
			 */
			if ((b1 & 0x0D) != 0)
				continue;

			b2 = buf[i + 2];
			b3 = buf[i + 3];

			/* Reassemble the scrambled 20-bit JAL immediate. */
			addr = ((b1 & 0xF0) << 13) | (b2 << 9) | (b3 << 1);
			addr -= s->pos + (uint32_t)i;

			/* Scatter the bits back into JAL's immediate layout. */
			buf[i + 1] = (uint8_t)((b1 & 0x0F)
					| ((addr >> 8) & 0xF0));

			buf[i + 2] = (uint8_t)(((addr >> 16) & 0x0F)
					| ((addr >> 7) & 0x10)
					| ((addr << 4) & 0xE0));

			buf[i + 3] = (uint8_t)(((addr >> 4) & 0x7F)
					| ((addr >> 13) & 0x80));

			/* The loop adds 2; net advance is one 32-bit insn. */
			i += 4 - 2;

		} else if ((instr & 0x7F) == 0x17) {
			/* AUIPC */
			instr |= (uint32_t)buf[i + 1] << 8;
			instr |= (uint32_t)buf[i + 2] << 16;
			instr |= (uint32_t)buf[i + 3] << 24;

			/*
			 * 0xE80 covers every rd bit (bits 7-11) except bit 8,
			 * so it is zero exactly when rd is x0 or x2.
			 */
			if (instr & 0xE80) {
				/* AUIPC's rd doesn't equal x0 or x2. */
				instr2 = get_unaligned_le32(buf + i + 4);

				/*
				 * Check that the second instruction pairs
				 * with this AUIPC the way the encoder
				 * required; otherwise skip the whole pair.
				 */
				if (((instr << 8) ^ (instr2 - 3)) & 0xF8003) {
					i += 6 - 2;
					continue;
				}

				/* Combine the split absolute address. */
				addr = (instr & 0xFFFFF000) + (instr2 >> 20);

				/* Re-encode as the encoder's special pair. */
				instr = 0x17 | (2 << 7) | (instr2 << 12);
				instr2 = addr;
			} else {
				/* AUIPC's rd equals x0 or x2. */
				instr2_rs1 = instr >> 27;

				/*
				 * Branchless validity test for the special
				 * encoding the encoder emits for rd=x0/x2.
				 * NOTE(review): relies on the encoder's
				 * 0x3117 marker -- confirm against the
				 * xz RISC-V filter specification.
				 */
				if ((uint32_t)((instr - 0x3117) << 18)
						>= (instr2_rs1 & 0x1D)) {
					i += 4 - 2;
					continue;
				}

				/* The stored address is big endian here. */
				addr = get_unaligned_be32(buf + i + 4);
				addr -= s->pos + (uint32_t)i;

				instr2 = (instr >> 12) | (addr << 20);

				/* Rebuild AUIPC; +0x800 rounds the upper part. */
				instr = 0x17 | (instr2_rs1 << 7)
					| ((addr + 0x800) & 0xFFFFF000);
			}

			put_unaligned_le32(instr, buf + i);
			put_unaligned_le32(instr2, buf + i + 4);

			/* Net advance: the whole 8-byte pair. */
			i += 8 - 2;
		}
	}

	return i;
}
396 #endif
397 
398 /*
399  * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
400  * of data that got filtered.
401  *
402  * NOTE: This is implemented as a switch statement to avoid using function
403  * pointers, which could be problematic in the kernel boot code, which must
404  * avoid pointers to static data (at least on x86).
405  */
bcj_apply(struct xz_dec_bcj * s,uint8_t * buf,size_t * pos,size_t size)406 static void bcj_apply(struct xz_dec_bcj *s,
407 		      uint8_t *buf, size_t *pos, size_t size)
408 {
409 	size_t filtered;
410 
411 	buf += *pos;
412 	size -= *pos;
413 
414 	switch (s->type) {
415 #ifdef XZ_DEC_X86
416 	case BCJ_X86:
417 		filtered = bcj_x86(s, buf, size);
418 		break;
419 #endif
420 #ifdef XZ_DEC_POWERPC
421 	case BCJ_POWERPC:
422 		filtered = bcj_powerpc(s, buf, size);
423 		break;
424 #endif
425 #ifdef XZ_DEC_ARM
426 	case BCJ_ARM:
427 		filtered = bcj_arm(s, buf, size);
428 		break;
429 #endif
430 #ifdef XZ_DEC_ARMTHUMB
431 	case BCJ_ARMTHUMB:
432 		filtered = bcj_armthumb(s, buf, size);
433 		break;
434 #endif
435 #ifdef XZ_DEC_SPARC
436 	case BCJ_SPARC:
437 		filtered = bcj_sparc(s, buf, size);
438 		break;
439 #endif
440 #ifdef XZ_DEC_ARM64
441 	case BCJ_ARM64:
442 		filtered = bcj_arm64(s, buf, size);
443 		break;
444 #endif
445 #ifdef XZ_DEC_RISCV
446 	case BCJ_RISCV:
447 		filtered = bcj_riscv(s, buf, size);
448 		break;
449 #endif
450 	default:
451 		/* Never reached but silence compiler warnings. */
452 		filtered = 0;
453 		break;
454 	}
455 
456 	*pos += filtered;
457 	s->pos += filtered;
458 }
459 
460 /*
461  * Flush pending filtered data from temp to the output buffer.
462  * Move the remaining mixture of possibly filtered and unfiltered
463  * data to the beginning of temp.
464  */
bcj_flush(struct xz_dec_bcj * s,struct xz_buf * b)465 static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
466 {
467 	size_t copy_size;
468 
469 	copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
470 	memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
471 	b->out_pos += copy_size;
472 
473 	s->temp.filtered -= copy_size;
474 	s->temp.size -= copy_size;
475 	memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
476 }
477 
/*
 * Decode raw stream data with the next filter in the chain (LZMA2) and
 * apply the selected BCJ filter on the decoded output.
 *
 * The BCJ filter functions are primitive in sense that they process the
 * data in chunks of 1-16 bytes. To hide this issue, this function does
 * some buffering.
 *
 * @s:     BCJ filter state
 * @lzma2: state of the next filter in the chain
 * @b:     input/output buffers; b->out receives filtered data
 *
 * Returns XZ_STREAM_END when everything has been decoded and filtered,
 * XZ_OK when more calls are needed, or an error from the next filter.
 */
enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2,
			   struct xz_buf *b)
{
	size_t out_start;

	/*
	 * Flush pending already filtered data to the output buffer. Return
	 * immediately if we couldn't flush everything, or if the next
	 * filter in the chain had already returned XZ_STREAM_END.
	 */
	if (s->temp.filtered > 0) {
		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;

		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
	}

	/*
	 * If we have more output space than what is currently pending in
	 * temp, copy the unfiltered data from temp to the output buffer
	 * and try to fill the output buffer by decoding more data from the
	 * next filter in the chain. Apply the BCJ filter on the new data
	 * in the output buffer. If everything cannot be filtered, copy it
	 * to temp and rewind the output buffer position accordingly.
	 *
	 * This needs to be always run when temp.size == 0 to handle a special
	 * case where the output buffer is full and the next filter has no
	 * more output coming but hasn't returned XZ_STREAM_END yet.
	 */
	if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
		out_start = b->out_pos;
		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
		b->out_pos += s->temp.size;

		s->ret = xz_dec_lzma2_run(lzma2, b);
		/*
		 * Propagate errors immediately. In single-call mode even
		 * XZ_OK is returned to the caller as-is since there will
		 * be no further calls to continue with.
		 */
		if (s->ret != XZ_STREAM_END
				&& (s->ret != XZ_OK || s->single_call))
			return s->ret;

		bcj_apply(s, b->out, &out_start, b->out_pos);

		/*
		 * As an exception, if the next filter returned XZ_STREAM_END,
		 * we can do that too, since the last few bytes that remain
		 * unfiltered are meant to remain unfiltered.
		 */
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;

		/* Move the unfiltered tail back into temp. */
		s->temp.size = b->out_pos - out_start;
		b->out_pos -= s->temp.size;
		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);

		/*
		 * If there wasn't enough input to the next filter to fill
		 * the output buffer with unfiltered data, there's no point
		 * to try decoding more data to temp.
		 */
		if (b->out_pos + s->temp.size < b->out_size)
			return XZ_OK;
	}

	/*
	 * We have unfiltered data in temp. If the output buffer isn't full
	 * yet, try to fill the temp buffer by decoding more data from the
	 * next filter. Apply the BCJ filter on temp. Then we hopefully can
	 * fill the actual output buffer by copying filtered data from temp.
	 * A mix of filtered and unfiltered data may be left in temp; it will
	 * be taken care on the next call to this function.
	 */
	if (b->out_pos < b->out_size) {
		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
		s->out = b->out;
		s->out_pos = b->out_pos;
		s->out_size = b->out_size;
		b->out = s->temp.buf;
		b->out_pos = s->temp.size;
		b->out_size = sizeof(s->temp.buf);

		s->ret = xz_dec_lzma2_run(lzma2, b);

		/* Restore the caller's buffer description. */
		s->temp.size = b->out_pos;
		b->out = s->out;
		b->out_pos = s->out_pos;
		b->out_size = s->out_size;

		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
			return s->ret;

		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

		/*
		 * If the next filter returned XZ_STREAM_END, we mark that
		 * everything is filtered, since the last unfiltered bytes
		 * of the stream are meant to be left as is.
		 */
		if (s->ret == XZ_STREAM_END)
			s->temp.filtered = s->temp.size;

		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;
	}

	return s->ret;
}
591 
xz_dec_bcj_create(bool single_call)592 struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)
593 {
594 	struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
595 	if (s != NULL)
596 		s->single_call = single_call;
597 
598 	return s;
599 }
600 
/*
 * Decode the Filter ID of a BCJ filter and reset the decoder state for
 * a new Block.
 *
 * Returns XZ_OK if @id names a BCJ filter that was compiled in,
 * XZ_OPTIONS_ERROR otherwise.
 */
enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)
{
	/*
	 * Accept only the Filter IDs whose decoders were enabled at
	 * compile time; the case values match the enum in
	 * struct xz_dec_bcj.
	 */
	switch (id) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
#endif
#ifdef XZ_DEC_ARM64
	case BCJ_ARM64:
#endif
#ifdef XZ_DEC_RISCV
	case BCJ_RISCV:
#endif
		break;

	default:
		/* Unsupported Filter ID */
		return XZ_OPTIONS_ERROR;
	}

	/* Reset the position and all per-filter state. */
	s->type = id;
	s->ret = XZ_OK;
	s->pos = 0;
	s->x86_prev_mask = 0;
	s->temp.filtered = 0;
	s->temp.size = 0;

	return XZ_OK;
}
641 
642 #endif
643