xref: /linux/drivers/media/test-drivers/vicodec/codec-fwht.c (revision 9dbbc3b9d09d6deba9f3b9e1d5b355032ed46a75)
1 // SPDX-License-Identifier: LGPL-2.1+
2 /*
3  * Copyright 2016 Tom aan de Wiel
4  * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5  *
6  * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
7  *
8  * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
9  * R.D. Brown, 1977
10  */
11 
12 #include <linux/string.h>
13 #include <linux/kernel.h>
14 #include <linux/videodev2.h>
15 #include "codec-fwht.h"
16 
/*
 * Block header bits.
 *
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */
/* set in a block header when the block is coded as a delta (P-block) */
#define PFRAME_BIT BIT(15)
/* returned by derlc() instead of a header when the input ran out */
#define OVERFLOW_BIT BIT(14)
/* header bits holding the number of repeats of the block, times two */
#define DUPS_MASK 0x1ffe

/* block types handed to rlc() and ifwht() */
#define PBLOCK 0
#define IBLOCK 1

/* run-length value meaning "the rest of the block is zero" */
#define ALL_ZEROS 15
31 
/*
 * Scan order used by the run-length coder: entry n is the raster index
 * (row * 8 + column) of the n-th coefficient visited. Each line below
 * lists one anti-diagonal of the 8x8 block.
 */
static const uint8_t zigzag[8 * 8] = {
	/* diagonals of growing length */
	0,
	1, 8,
	2, 9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	/* diagonals of shrinking length */
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
49 
50 /*
51  * noinline_for_stack to work around
52  * https://bugs.llvm.org/show_bug.cgi?id=38809
53  */
54 static int noinline_for_stack
55 rlc(const s16 *in, __be16 *output, int blocktype)
56 {
57 	s16 block[8 * 8];
58 	s16 *wp = block;
59 	int i = 0;
60 	int x, y;
61 	int ret = 0;
62 
63 	/* read in block from framebuffer */
64 	int lastzero_run = 0;
65 	int to_encode;
66 
67 	for (y = 0; y < 8; y++) {
68 		for (x = 0; x < 8; x++) {
69 			*wp = in[x + y * 8];
70 			wp++;
71 		}
72 	}
73 
74 	/* keep track of amount of trailing zeros */
75 	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
76 		lastzero_run++;
77 
78 	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
79 	ret++;
80 
81 	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
82 
83 	i = 0;
84 	while (i < to_encode) {
85 		int cnt = 0;
86 		int tmp;
87 
88 		/* count leading zeros */
89 		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
90 			cnt++;
91 			i++;
92 			if (i == to_encode) {
93 				cnt--;
94 				break;
95 			}
96 		}
97 		/* 4 bits for run, 12 for coefficient (quantization by 4) */
98 		*output++ = htons((cnt | tmp << 4));
99 		i++;
100 		ret++;
101 	}
102 	if (lastzero_run > 14) {
103 		*output = htons(ALL_ZEROS | 0);
104 		ret++;
105 	}
106 
107 	return ret;
108 }
109 
110 /*
111  * This function will worst-case increase rlc_in by 65*2 bytes:
112  * one s16 value for the header and 8 * 8 coefficients of type s16.
113  */
114 static noinline_for_stack u16
115 derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
116 {
117 	/* header */
118 	const __be16 *input = *rlc_in;
119 	u16 stat;
120 	int dec_count = 0;
121 	s16 block[8 * 8 + 16];
122 	s16 *wp = block;
123 	int i;
124 
125 	if (input > end_of_input)
126 		return OVERFLOW_BIT;
127 	stat = ntohs(*input++);
128 
129 	/*
130 	 * Now de-compress, it expands one byte to up to 15 bytes
131 	 * (or fills the remainder of the 64 bytes with zeroes if it
132 	 * is the last byte to expand).
133 	 *
134 	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
135 	 * allow for overflow if the incoming data was malformed.
136 	 */
137 	while (dec_count < 8 * 8) {
138 		s16 in;
139 		int length;
140 		int coeff;
141 
142 		if (input > end_of_input)
143 			return OVERFLOW_BIT;
144 		in = ntohs(*input++);
145 		length = in & 0xf;
146 		coeff = in >> 4;
147 
148 		/* fill remainder with zeros */
149 		if (length == 15) {
150 			for (i = 0; i < 64 - dec_count; i++)
151 				*wp++ = 0;
152 			break;
153 		}
154 
155 		for (i = 0; i < length; i++)
156 			*wp++ = 0;
157 		*wp++ = coeff;
158 		dec_count += length + 1;
159 	}
160 
161 	wp = block;
162 
163 	for (i = 0; i < 64; i++) {
164 		int pos = zigzag[i];
165 		int y = pos / 8;
166 		int x = pos % 8;
167 
168 		dwht_out[x + y * 8] = *wp++;
169 	}
170 	*rlc_in = input;
171 	return stat;
172 }
173 
/* Per-coefficient shift applied by quantize_intra()/dequantize_intra() */
static const int quant_table[8 * 8] = {
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 3,
	2, 2, 2, 2, 2, 2, 3, 6,
	2, 2, 2, 2, 2, 3, 6, 6,
	2, 2, 2, 2, 3, 6, 6, 6,
	2, 2, 2, 3, 6, 6, 6, 6,
	2, 2, 3, 6, 6, 6, 6, 8,
};
184 
/* Per-coefficient shift applied by quantize_inter()/dequantize_inter() */
static const int quant_table_p[8 * 8] = {
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 6,
	3, 3, 3, 3, 3, 3, 6, 6,
	3, 3, 3, 3, 3, 6, 6, 9,
	3, 3, 3, 3, 6, 6, 9, 9,
	3, 3, 3, 6, 6, 9, 9, 10,
};
195 
196 static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
197 {
198 	const int *quant = quant_table;
199 	int i, j;
200 
201 	for (j = 0; j < 8; j++) {
202 		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
203 			*coeff >>= *quant;
204 			if (*coeff >= -qp && *coeff <= qp)
205 				*coeff = *de_coeff = 0;
206 			else
207 				*de_coeff = *coeff << *quant;
208 		}
209 	}
210 }
211 
212 static void dequantize_intra(s16 *coeff)
213 {
214 	const int *quant = quant_table;
215 	int i, j;
216 
217 	for (j = 0; j < 8; j++)
218 		for (i = 0; i < 8; i++, quant++, coeff++)
219 			*coeff <<= *quant;
220 }
221 
222 static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
223 {
224 	const int *quant = quant_table_p;
225 	int i, j;
226 
227 	for (j = 0; j < 8; j++) {
228 		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
229 			*coeff >>= *quant;
230 			if (*coeff >= -qp && *coeff <= qp)
231 				*coeff = *de_coeff = 0;
232 			else
233 				*de_coeff = *coeff << *quant;
234 		}
235 	}
236 }
237 
238 static void dequantize_inter(s16 *coeff)
239 {
240 	const int *quant = quant_table_p;
241 	int i, j;
242 
243 	for (j = 0; j < 8; j++)
244 		for (i = 0; i < 8; i++, quant++, coeff++)
245 			*coeff <<= *quant;
246 }
247 
248 static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
249 				    unsigned int stride,
250 				    unsigned int input_step, bool intra)
251 {
252 	/* we'll need more than 8 bits for the transformed coefficients */
253 	s32 workspace1[8], workspace2[8];
254 	const u8 *tmp = block;
255 	s16 *out = output_block;
256 	int add = intra ? 256 : 0;
257 	unsigned int i;
258 
259 	/* stage 1 */
260 	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
261 		switch (input_step) {
262 		case 1:
263 			workspace1[0]  = tmp[0] + tmp[1] - add;
264 			workspace1[1]  = tmp[0] - tmp[1];
265 
266 			workspace1[2]  = tmp[2] + tmp[3] - add;
267 			workspace1[3]  = tmp[2] - tmp[3];
268 
269 			workspace1[4]  = tmp[4] + tmp[5] - add;
270 			workspace1[5]  = tmp[4] - tmp[5];
271 
272 			workspace1[6]  = tmp[6] + tmp[7] - add;
273 			workspace1[7]  = tmp[6] - tmp[7];
274 			break;
275 		case 2:
276 			workspace1[0]  = tmp[0] + tmp[2] - add;
277 			workspace1[1]  = tmp[0] - tmp[2];
278 
279 			workspace1[2]  = tmp[4] + tmp[6] - add;
280 			workspace1[3]  = tmp[4] - tmp[6];
281 
282 			workspace1[4]  = tmp[8] + tmp[10] - add;
283 			workspace1[5]  = tmp[8] - tmp[10];
284 
285 			workspace1[6]  = tmp[12] + tmp[14] - add;
286 			workspace1[7]  = tmp[12] - tmp[14];
287 			break;
288 		case 3:
289 			workspace1[0]  = tmp[0] + tmp[3] - add;
290 			workspace1[1]  = tmp[0] - tmp[3];
291 
292 			workspace1[2]  = tmp[6] + tmp[9] - add;
293 			workspace1[3]  = tmp[6] - tmp[9];
294 
295 			workspace1[4]  = tmp[12] + tmp[15] - add;
296 			workspace1[5]  = tmp[12] - tmp[15];
297 
298 			workspace1[6]  = tmp[18] + tmp[21] - add;
299 			workspace1[7]  = tmp[18] - tmp[21];
300 			break;
301 		default:
302 			workspace1[0]  = tmp[0] + tmp[4] - add;
303 			workspace1[1]  = tmp[0] - tmp[4];
304 
305 			workspace1[2]  = tmp[8] + tmp[12] - add;
306 			workspace1[3]  = tmp[8] - tmp[12];
307 
308 			workspace1[4]  = tmp[16] + tmp[20] - add;
309 			workspace1[5]  = tmp[16] - tmp[20];
310 
311 			workspace1[6]  = tmp[24] + tmp[28] - add;
312 			workspace1[7]  = tmp[24] - tmp[28];
313 			break;
314 		}
315 
316 		/* stage 2 */
317 		workspace2[0] = workspace1[0] + workspace1[2];
318 		workspace2[1] = workspace1[0] - workspace1[2];
319 		workspace2[2] = workspace1[1] - workspace1[3];
320 		workspace2[3] = workspace1[1] + workspace1[3];
321 
322 		workspace2[4] = workspace1[4] + workspace1[6];
323 		workspace2[5] = workspace1[4] - workspace1[6];
324 		workspace2[6] = workspace1[5] - workspace1[7];
325 		workspace2[7] = workspace1[5] + workspace1[7];
326 
327 		/* stage 3 */
328 		out[0] = workspace2[0] + workspace2[4];
329 		out[1] = workspace2[0] - workspace2[4];
330 		out[2] = workspace2[1] - workspace2[5];
331 		out[3] = workspace2[1] + workspace2[5];
332 		out[4] = workspace2[2] + workspace2[6];
333 		out[5] = workspace2[2] - workspace2[6];
334 		out[6] = workspace2[3] - workspace2[7];
335 		out[7] = workspace2[3] + workspace2[7];
336 	}
337 
338 	out = output_block;
339 
340 	for (i = 0; i < 8; i++, out++) {
341 		/* stage 1 */
342 		workspace1[0]  = out[0] + out[1 * 8];
343 		workspace1[1]  = out[0] - out[1 * 8];
344 
345 		workspace1[2]  = out[2 * 8] + out[3 * 8];
346 		workspace1[3]  = out[2 * 8] - out[3 * 8];
347 
348 		workspace1[4]  = out[4 * 8] + out[5 * 8];
349 		workspace1[5]  = out[4 * 8] - out[5 * 8];
350 
351 		workspace1[6]  = out[6 * 8] + out[7 * 8];
352 		workspace1[7]  = out[6 * 8] - out[7 * 8];
353 
354 		/* stage 2 */
355 		workspace2[0] = workspace1[0] + workspace1[2];
356 		workspace2[1] = workspace1[0] - workspace1[2];
357 		workspace2[2] = workspace1[1] - workspace1[3];
358 		workspace2[3] = workspace1[1] + workspace1[3];
359 
360 		workspace2[4] = workspace1[4] + workspace1[6];
361 		workspace2[5] = workspace1[4] - workspace1[6];
362 		workspace2[6] = workspace1[5] - workspace1[7];
363 		workspace2[7] = workspace1[5] + workspace1[7];
364 		/* stage 3 */
365 		out[0 * 8] = workspace2[0] + workspace2[4];
366 		out[1 * 8] = workspace2[0] - workspace2[4];
367 		out[2 * 8] = workspace2[1] - workspace2[5];
368 		out[3 * 8] = workspace2[1] + workspace2[5];
369 		out[4 * 8] = workspace2[2] + workspace2[6];
370 		out[5 * 8] = workspace2[2] - workspace2[6];
371 		out[6 * 8] = workspace2[3] - workspace2[7];
372 		out[7 * 8] = workspace2[3] + workspace2[7];
373 	}
374 }
375 
376 /*
377  * Not the nicest way of doing it, but P-blocks get twice the range of
378  * that of the I-blocks. Therefore we need a type bigger than 8 bits.
379  * Furthermore values can be negative... This is just a version that
380  * works with 16 signed data
381  */
382 static void noinline_for_stack
383 fwht16(const s16 *block, s16 *output_block, int stride, int intra)
384 {
385 	/* we'll need more than 8 bits for the transformed coefficients */
386 	s32 workspace1[8], workspace2[8];
387 	const s16 *tmp = block;
388 	s16 *out = output_block;
389 	int i;
390 
391 	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
392 		/* stage 1 */
393 		workspace1[0]  = tmp[0] + tmp[1];
394 		workspace1[1]  = tmp[0] - tmp[1];
395 
396 		workspace1[2]  = tmp[2] + tmp[3];
397 		workspace1[3]  = tmp[2] - tmp[3];
398 
399 		workspace1[4]  = tmp[4] + tmp[5];
400 		workspace1[5]  = tmp[4] - tmp[5];
401 
402 		workspace1[6]  = tmp[6] + tmp[7];
403 		workspace1[7]  = tmp[6] - tmp[7];
404 
405 		/* stage 2 */
406 		workspace2[0] = workspace1[0] + workspace1[2];
407 		workspace2[1] = workspace1[0] - workspace1[2];
408 		workspace2[2] = workspace1[1] - workspace1[3];
409 		workspace2[3] = workspace1[1] + workspace1[3];
410 
411 		workspace2[4] = workspace1[4] + workspace1[6];
412 		workspace2[5] = workspace1[4] - workspace1[6];
413 		workspace2[6] = workspace1[5] - workspace1[7];
414 		workspace2[7] = workspace1[5] + workspace1[7];
415 
416 		/* stage 3 */
417 		out[0] = workspace2[0] + workspace2[4];
418 		out[1] = workspace2[0] - workspace2[4];
419 		out[2] = workspace2[1] - workspace2[5];
420 		out[3] = workspace2[1] + workspace2[5];
421 		out[4] = workspace2[2] + workspace2[6];
422 		out[5] = workspace2[2] - workspace2[6];
423 		out[6] = workspace2[3] - workspace2[7];
424 		out[7] = workspace2[3] + workspace2[7];
425 	}
426 
427 	out = output_block;
428 
429 	for (i = 0; i < 8; i++, out++) {
430 		/* stage 1 */
431 		workspace1[0]  = out[0] + out[1*8];
432 		workspace1[1]  = out[0] - out[1*8];
433 
434 		workspace1[2]  = out[2*8] + out[3*8];
435 		workspace1[3]  = out[2*8] - out[3*8];
436 
437 		workspace1[4]  = out[4*8] + out[5*8];
438 		workspace1[5]  = out[4*8] - out[5*8];
439 
440 		workspace1[6]  = out[6*8] + out[7*8];
441 		workspace1[7]  = out[6*8] - out[7*8];
442 
443 		/* stage 2 */
444 		workspace2[0] = workspace1[0] + workspace1[2];
445 		workspace2[1] = workspace1[0] - workspace1[2];
446 		workspace2[2] = workspace1[1] - workspace1[3];
447 		workspace2[3] = workspace1[1] + workspace1[3];
448 
449 		workspace2[4] = workspace1[4] + workspace1[6];
450 		workspace2[5] = workspace1[4] - workspace1[6];
451 		workspace2[6] = workspace1[5] - workspace1[7];
452 		workspace2[7] = workspace1[5] + workspace1[7];
453 
454 		/* stage 3 */
455 		out[0*8] = workspace2[0] + workspace2[4];
456 		out[1*8] = workspace2[0] - workspace2[4];
457 		out[2*8] = workspace2[1] - workspace2[5];
458 		out[3*8] = workspace2[1] + workspace2[5];
459 		out[4*8] = workspace2[2] + workspace2[6];
460 		out[5*8] = workspace2[2] - workspace2[6];
461 		out[6*8] = workspace2[3] - workspace2[7];
462 		out[7*8] = workspace2[3] + workspace2[7];
463 	}
464 }
465 
466 static noinline_for_stack void
467 ifwht(const s16 *block, s16 *output_block, int intra)
468 {
469 	/*
470 	 * we'll need more than 8 bits for the transformed coefficients
471 	 * use native unit of cpu
472 	 */
473 	int workspace1[8], workspace2[8];
474 	int inter = intra ? 0 : 1;
475 	const s16 *tmp = block;
476 	s16 *out = output_block;
477 	int i;
478 
479 	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
480 		/* stage 1 */
481 		workspace1[0]  = tmp[0] + tmp[1];
482 		workspace1[1]  = tmp[0] - tmp[1];
483 
484 		workspace1[2]  = tmp[2] + tmp[3];
485 		workspace1[3]  = tmp[2] - tmp[3];
486 
487 		workspace1[4]  = tmp[4] + tmp[5];
488 		workspace1[5]  = tmp[4] - tmp[5];
489 
490 		workspace1[6]  = tmp[6] + tmp[7];
491 		workspace1[7]  = tmp[6] - tmp[7];
492 
493 		/* stage 2 */
494 		workspace2[0] = workspace1[0] + workspace1[2];
495 		workspace2[1] = workspace1[0] - workspace1[2];
496 		workspace2[2] = workspace1[1] - workspace1[3];
497 		workspace2[3] = workspace1[1] + workspace1[3];
498 
499 		workspace2[4] = workspace1[4] + workspace1[6];
500 		workspace2[5] = workspace1[4] - workspace1[6];
501 		workspace2[6] = workspace1[5] - workspace1[7];
502 		workspace2[7] = workspace1[5] + workspace1[7];
503 
504 		/* stage 3 */
505 		out[0] = workspace2[0] + workspace2[4];
506 		out[1] = workspace2[0] - workspace2[4];
507 		out[2] = workspace2[1] - workspace2[5];
508 		out[3] = workspace2[1] + workspace2[5];
509 		out[4] = workspace2[2] + workspace2[6];
510 		out[5] = workspace2[2] - workspace2[6];
511 		out[6] = workspace2[3] - workspace2[7];
512 		out[7] = workspace2[3] + workspace2[7];
513 	}
514 
515 	out = output_block;
516 
517 	for (i = 0; i < 8; i++, out++) {
518 		/* stage 1 */
519 		workspace1[0]  = out[0] + out[1 * 8];
520 		workspace1[1]  = out[0] - out[1 * 8];
521 
522 		workspace1[2]  = out[2 * 8] + out[3 * 8];
523 		workspace1[3]  = out[2 * 8] - out[3 * 8];
524 
525 		workspace1[4]  = out[4 * 8] + out[5 * 8];
526 		workspace1[5]  = out[4 * 8] - out[5 * 8];
527 
528 		workspace1[6]  = out[6 * 8] + out[7 * 8];
529 		workspace1[7]  = out[6 * 8] - out[7 * 8];
530 
531 		/* stage 2 */
532 		workspace2[0] = workspace1[0] + workspace1[2];
533 		workspace2[1] = workspace1[0] - workspace1[2];
534 		workspace2[2] = workspace1[1] - workspace1[3];
535 		workspace2[3] = workspace1[1] + workspace1[3];
536 
537 		workspace2[4] = workspace1[4] + workspace1[6];
538 		workspace2[5] = workspace1[4] - workspace1[6];
539 		workspace2[6] = workspace1[5] - workspace1[7];
540 		workspace2[7] = workspace1[5] + workspace1[7];
541 
542 		/* stage 3 */
543 		if (inter) {
544 			int d;
545 
546 			out[0 * 8] = workspace2[0] + workspace2[4];
547 			out[1 * 8] = workspace2[0] - workspace2[4];
548 			out[2 * 8] = workspace2[1] - workspace2[5];
549 			out[3 * 8] = workspace2[1] + workspace2[5];
550 			out[4 * 8] = workspace2[2] + workspace2[6];
551 			out[5 * 8] = workspace2[2] - workspace2[6];
552 			out[6 * 8] = workspace2[3] - workspace2[7];
553 			out[7 * 8] = workspace2[3] + workspace2[7];
554 
555 			for (d = 0; d < 8; d++)
556 				out[8 * d] >>= 6;
557 		} else {
558 			int d;
559 
560 			out[0 * 8] = workspace2[0] + workspace2[4];
561 			out[1 * 8] = workspace2[0] - workspace2[4];
562 			out[2 * 8] = workspace2[1] - workspace2[5];
563 			out[3 * 8] = workspace2[1] + workspace2[5];
564 			out[4 * 8] = workspace2[2] + workspace2[6];
565 			out[5 * 8] = workspace2[2] - workspace2[6];
566 			out[6 * 8] = workspace2[3] - workspace2[7];
567 			out[7 * 8] = workspace2[3] + workspace2[7];
568 
569 			for (d = 0; d < 8; d++) {
570 				out[8 * d] >>= 6;
571 				out[8 * d] += 128;
572 			}
573 		}
574 	}
575 }
576 
577 static void fill_encoder_block(const u8 *input, s16 *dst,
578 			       unsigned int stride, unsigned int input_step)
579 {
580 	int i, j;
581 
582 	for (i = 0; i < 8; i++) {
583 		for (j = 0; j < 8; j++, input += input_step)
584 			*dst++ = *input;
585 		input += stride - 8 * input_step;
586 	}
587 }
588 
589 static int var_intra(const s16 *input)
590 {
591 	int32_t mean = 0;
592 	int32_t ret = 0;
593 	const s16 *tmp = input;
594 	int i;
595 
596 	for (i = 0; i < 8 * 8; i++, tmp++)
597 		mean += *tmp;
598 	mean /= 64;
599 	tmp = input;
600 	for (i = 0; i < 8 * 8; i++, tmp++)
601 		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
602 	return ret;
603 }
604 
605 static int var_inter(const s16 *old, const s16 *new)
606 {
607 	int32_t ret = 0;
608 	int i;
609 
610 	for (i = 0; i < 8 * 8; i++, old++, new++)
611 		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
612 	return ret;
613 }
614 
615 static noinline_for_stack int
616 decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
617 		 unsigned int stride, unsigned int input_step)
618 {
619 	s16 tmp[64];
620 	s16 old[64];
621 	s16 *work = tmp;
622 	unsigned int k, l;
623 	int vari;
624 	int vard;
625 
626 	fill_encoder_block(cur, tmp, stride, input_step);
627 	fill_encoder_block(reference, old, 8, 1);
628 	vari = var_intra(tmp);
629 
630 	for (k = 0; k < 8; k++) {
631 		for (l = 0; l < 8; l++) {
632 			*deltablock = *work - *reference;
633 			deltablock++;
634 			work++;
635 			reference++;
636 		}
637 	}
638 	deltablock -= 64;
639 	vard = var_inter(old, tmp);
640 	return vari <= vard ? IBLOCK : PBLOCK;
641 }
642 
643 static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
644 			       unsigned int dst_step)
645 {
646 	int i, j;
647 
648 	for (i = 0; i < 8; i++) {
649 		for (j = 0; j < 8; j++, input++, dst += dst_step) {
650 			if (*input < 0)
651 				*dst = 0;
652 			else if (*input > 255)
653 				*dst = 255;
654 			else
655 				*dst = *input;
656 		}
657 		dst += stride - (8 * dst_step);
658 	}
659 }
660 
661 static void add_deltas(s16 *deltas, const u8 *ref, int stride,
662 		       unsigned int ref_step)
663 {
664 	int k, l;
665 
666 	for (k = 0; k < 8; k++) {
667 		for (l = 0; l < 8; l++) {
668 			*deltas += *ref;
669 			ref += ref_step;
670 			/*
671 			 * Due to quantizing, it might possible that the
672 			 * decoded coefficients are slightly out of range
673 			 */
674 			if (*deltas < 0)
675 				*deltas = 0;
676 			else if (*deltas > 255)
677 				*deltas = 255;
678 			deltas++;
679 		}
680 		ref += stride - (8 * ref_step);
681 	}
682 }
683 
684 static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
685 			struct fwht_cframe *cf, u32 height, u32 width,
686 			u32 stride, unsigned int input_step,
687 			bool is_intra, bool next_is_intra)
688 {
689 	u8 *input_start = input;
690 	__be16 *rlco_start = *rlco;
691 	s16 deltablock[64];
692 	__be16 pframe_bit = htons(PFRAME_BIT);
693 	u32 encoding = 0;
694 	unsigned int last_size = 0;
695 	unsigned int i, j;
696 
697 	width = round_up(width, 8);
698 	height = round_up(height, 8);
699 
700 	for (j = 0; j < height / 8; j++) {
701 		input = input_start + j * 8 * stride;
702 		for (i = 0; i < width / 8; i++) {
703 			/* intra code, first frame is always intra coded. */
704 			int blocktype = IBLOCK;
705 			unsigned int size;
706 
707 			if (!is_intra)
708 				blocktype = decide_blocktype(input, refp,
709 					deltablock, stride, input_step);
710 			if (blocktype == IBLOCK) {
711 				fwht(input, cf->coeffs, stride, input_step, 1);
712 				quantize_intra(cf->coeffs, cf->de_coeffs,
713 					       cf->i_frame_qp);
714 			} else {
715 				/* inter code */
716 				encoding |= FWHT_FRAME_PCODED;
717 				fwht16(deltablock, cf->coeffs, 8, 0);
718 				quantize_inter(cf->coeffs, cf->de_coeffs,
719 					       cf->p_frame_qp);
720 			}
721 			if (!next_is_intra) {
722 				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);
723 
724 				if (blocktype == PBLOCK)
725 					add_deltas(cf->de_fwht, refp, 8, 1);
726 				fill_decoder_block(refp, cf->de_fwht, 8, 1);
727 			}
728 
729 			input += 8 * input_step;
730 			refp += 8 * 8;
731 
732 			size = rlc(cf->coeffs, *rlco, blocktype);
733 			if (last_size == size &&
734 			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
735 				__be16 *last_rlco = *rlco - size;
736 				s16 hdr = ntohs(*last_rlco);
737 
738 				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
739 				    (hdr & DUPS_MASK) < DUPS_MASK)
740 					*last_rlco = htons(hdr + 2);
741 				else
742 					*rlco += size;
743 			} else {
744 				*rlco += size;
745 			}
746 			if (*rlco >= rlco_max) {
747 				encoding |= FWHT_FRAME_UNENCODED;
748 				goto exit_loop;
749 			}
750 			last_size = size;
751 		}
752 	}
753 
754 exit_loop:
755 	if (encoding & FWHT_FRAME_UNENCODED) {
756 		u8 *out = (u8 *)rlco_start;
757 		u8 *p;
758 
759 		input = input_start;
760 		/*
761 		 * The compressed stream should never contain the magic
762 		 * header, so when we copy the YUV data we replace 0xff
763 		 * by 0xfe. Since YUV is limited range such values
764 		 * shouldn't appear anyway.
765 		 */
766 		for (j = 0; j < height; j++) {
767 			for (i = 0, p = input; i < width; i++, p += input_step)
768 				*out++ = (*p == 0xff) ? 0xfe : *p;
769 			input += stride;
770 		}
771 		*rlco = (__be16 *)out;
772 		encoding &= ~FWHT_FRAME_PCODED;
773 	}
774 	return encoding;
775 }
776 
777 u32 fwht_encode_frame(struct fwht_raw_frame *frm,
778 		      struct fwht_raw_frame *ref_frm,
779 		      struct fwht_cframe *cf,
780 		      bool is_intra, bool next_is_intra,
781 		      unsigned int width, unsigned int height,
782 		      unsigned int stride, unsigned int chroma_stride)
783 {
784 	unsigned int size = height * width;
785 	__be16 *rlco = cf->rlc_data;
786 	__be16 *rlco_max;
787 	u32 encoding;
788 
789 	rlco_max = rlco + size / 2 - 256;
790 	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
791 				height, width, stride,
792 				frm->luma_alpha_step, is_intra, next_is_intra);
793 	if (encoding & FWHT_FRAME_UNENCODED)
794 		encoding |= FWHT_LUMA_UNENCODED;
795 	encoding &= ~FWHT_FRAME_UNENCODED;
796 
797 	if (frm->components_num >= 3) {
798 		u32 chroma_h = height / frm->height_div;
799 		u32 chroma_w = width / frm->width_div;
800 		unsigned int chroma_size = chroma_h * chroma_w;
801 
802 		rlco_max = rlco + chroma_size / 2 - 256;
803 		encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
804 					 cf, chroma_h, chroma_w,
805 					 chroma_stride, frm->chroma_step,
806 					 is_intra, next_is_intra);
807 		if (encoding & FWHT_FRAME_UNENCODED)
808 			encoding |= FWHT_CB_UNENCODED;
809 		encoding &= ~FWHT_FRAME_UNENCODED;
810 		rlco_max = rlco + chroma_size / 2 - 256;
811 		encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
812 					 cf, chroma_h, chroma_w,
813 					 chroma_stride, frm->chroma_step,
814 					 is_intra, next_is_intra);
815 		if (encoding & FWHT_FRAME_UNENCODED)
816 			encoding |= FWHT_CR_UNENCODED;
817 		encoding &= ~FWHT_FRAME_UNENCODED;
818 	}
819 
820 	if (frm->components_num == 4) {
821 		rlco_max = rlco + size / 2 - 256;
822 		encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
823 					 rlco_max, cf, height, width,
824 					 stride, frm->luma_alpha_step,
825 					 is_intra, next_is_intra);
826 		if (encoding & FWHT_FRAME_UNENCODED)
827 			encoding |= FWHT_ALPHA_UNENCODED;
828 		encoding &= ~FWHT_FRAME_UNENCODED;
829 	}
830 
831 	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
832 	return encoding;
833 }
834 
835 static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
836 			 u32 height, u32 width, const u8 *ref, u32 ref_stride,
837 			 unsigned int ref_step, u8 *dst,
838 			 unsigned int dst_stride, unsigned int dst_step,
839 			 bool uncompressed, const __be16 *end_of_rlco_buf)
840 {
841 	unsigned int copies = 0;
842 	s16 copy[8 * 8];
843 	u16 stat;
844 	unsigned int i, j;
845 	bool is_intra = !ref;
846 
847 	width = round_up(width, 8);
848 	height = round_up(height, 8);
849 
850 	if (uncompressed) {
851 		int i;
852 
853 		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
854 			return false;
855 		for (i = 0; i < height; i++) {
856 			memcpy(dst, *rlco, width);
857 			dst += dst_stride;
858 			*rlco += width / 2;
859 		}
860 		return true;
861 	}
862 
863 	/*
864 	 * When decoding each macroblock the rlco pointer will be increased
865 	 * by 65 * 2 bytes worst-case.
866 	 * To avoid overflow the buffer has to be 65/64th of the actual raw
867 	 * image size, just in case someone feeds it malicious data.
868 	 */
869 	for (j = 0; j < height / 8; j++) {
870 		for (i = 0; i < width / 8; i++) {
871 			const u8 *refp = ref + j * 8 * ref_stride +
872 				i * 8 * ref_step;
873 			u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;
874 
875 			if (copies) {
876 				memcpy(cf->de_fwht, copy, sizeof(copy));
877 				if ((stat & PFRAME_BIT) && !is_intra)
878 					add_deltas(cf->de_fwht, refp,
879 						   ref_stride, ref_step);
880 				fill_decoder_block(dstp, cf->de_fwht,
881 						   dst_stride, dst_step);
882 				copies--;
883 				continue;
884 			}
885 
886 			stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
887 			if (stat & OVERFLOW_BIT)
888 				return false;
889 			if ((stat & PFRAME_BIT) && !is_intra)
890 				dequantize_inter(cf->coeffs);
891 			else
892 				dequantize_intra(cf->coeffs);
893 
894 			ifwht(cf->coeffs, cf->de_fwht,
895 			      ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1);
896 
897 			copies = (stat & DUPS_MASK) >> 1;
898 			if (copies)
899 				memcpy(copy, cf->de_fwht, sizeof(copy));
900 			if ((stat & PFRAME_BIT) && !is_intra)
901 				add_deltas(cf->de_fwht, refp,
902 					   ref_stride, ref_step);
903 			fill_decoder_block(dstp, cf->de_fwht, dst_stride,
904 					   dst_step);
905 		}
906 	}
907 	return true;
908 }
909 
910 bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
911 		       unsigned int components_num, unsigned int width,
912 		       unsigned int height, const struct fwht_raw_frame *ref,
913 		       unsigned int ref_stride, unsigned int ref_chroma_stride,
914 		       struct fwht_raw_frame *dst, unsigned int dst_stride,
915 		       unsigned int dst_chroma_stride)
916 {
917 	const __be16 *rlco = cf->rlc_data;
918 	const __be16 *end_of_rlco_buf = cf->rlc_data +
919 			(cf->size / sizeof(*rlco)) - 1;
920 
921 	if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
922 			  ref->luma_alpha_step, dst->luma, dst_stride,
923 			  dst->luma_alpha_step,
924 			  hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED,
925 			  end_of_rlco_buf))
926 		return false;
927 
928 	if (components_num >= 3) {
929 		u32 h = height;
930 		u32 w = width;
931 
932 		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT))
933 			h /= 2;
934 		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH))
935 			w /= 2;
936 
937 		if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride,
938 				  ref->chroma_step, dst->cb, dst_chroma_stride,
939 				  dst->chroma_step,
940 				  hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED,
941 				  end_of_rlco_buf))
942 			return false;
943 		if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride,
944 				  ref->chroma_step, dst->cr, dst_chroma_stride,
945 				  dst->chroma_step,
946 				  hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED,
947 				  end_of_rlco_buf))
948 			return false;
949 	}
950 
951 	if (components_num == 4)
952 		if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
953 				  ref->luma_alpha_step, dst->alpha, dst_stride,
954 				  dst->luma_alpha_step,
955 				  hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED,
956 				  end_of_rlco_buf))
957 			return false;
958 	return true;
959 }
960