// SPDX-License-Identifier: BSD-2-Clause
/*
   LZ4 - Fast LZ compression algorithm
   Copyright (C) 2011-present, Yann Collet.

   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   You can contact the author at :
    - LZ4 homepage : http://www.lz4.org
    - LZ4 source repository : https://github.com/lz4/lz4
*/

/*
 * This file contains unmodified code from lz4 1.9.3's decompressor, plus
 * associated macros and constants.
 *
 * It also contains a couple of defines from the old lz4.c to make things
 * fit together smoothly.
 */

#include <sys/zfs_context.h>

int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
    int isize, int maxOutputSize);

/*
 * Tuning parameters
 */

/*
 * COMPRESSIONLEVEL: Increasing this value improves compression ratio.
 *	Lowering this value reduces memory usage. Reduced memory usage
 *	typically improves speed, due to cache effects (ex: L1 32KB for Intel,
 *	L1 64KB for AMD). Memory usage formula: N -> 2^(N+2) bytes
 *	(examples: 12 -> 16KB; 17 -> 512KB)
 */
#define	COMPRESSIONLEVEL 12

/*
 * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value makes the algorithm
 *	skip data segments it considers "incompressible" more quickly.
 *	This may decrease the compression ratio dramatically, but will be
 *	faster on incompressible data. Increasing this value makes the
 *	algorithm search longer before declaring a segment "incompressible".
 *	This could improve compression a bit, but will be slower on
 *	incompressible data. The default value (6) is recommended.
 */
#define	NOTCOMPRESSIBLE_CONFIRMATION 6

/*
 * Little Endian or Big Endian?
 * Note: overwrite the below #define if you know your architecture endianness.
 */
#if defined(_ZFS_BIG_ENDIAN)
#define	LZ4_BIG_ENDIAN 1
#else
/*
 * Little Endian assumed. PDP Endian and other very rare endian formats
 * are unsupported.
 */
#undef LZ4_BIG_ENDIAN
#endif

/*-************************************
*  CPU Feature Detection
**************************************/
/* LZ4_FORCE_MEMORY_ACCESS
 * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
 * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
 * The below switch allows selecting a different access method for improved performance.
 * Method 0 (default) : use `memcpy()`. Safe and portable.
 * Method 1 : `__packed` statement. It depends on a compiler extension (i.e., not portable).
 *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
 * Method 2 : direct access. This method is portable but violates the C standard.
 *            It can generate buggy code on targets whose assembly generation depends on alignment.
 *            But in some circumstances, it's the only known way to get the most performance (e.g. GCC + ARMv6).
 * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
 * Prefer these methods in priority order (0 > 1 > 2)
 */
#ifndef LZ4_FORCE_MEMORY_ACCESS   /* can be defined externally */
#  if defined(__GNUC__) && \
  ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \
  || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
#    define LZ4_FORCE_MEMORY_ACCESS 2
#  elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__)
#    define LZ4_FORCE_MEMORY_ACCESS 1
#  endif
#endif

/*
 * LZ4_FORCE_SW_BITCOUNT
 * Define this parameter if your target system or compiler does not support hardware bit count
 */
/*
 * Illumos : we can't use GCC's __builtin_ctz family of builtins in the
 * kernel
 * Linux : we can use GCC's __builtin_ctz family of builtins in the
 * kernel
 */
#undef	LZ4_FORCE_SW_BITCOUNT
#if defined(__sunos__)
#define	LZ4_FORCE_SW_BITCOUNT
#endif

/*
 * Compiler Options
 */
/* Disable restrict */
#define	restrict

/*
 * Linux : GCC_VERSION is defined as of 3.9-rc1, so undefine it.
 * torvalds/linux@3f3f8d2f48acfd8ed3b8e6b7377935da57b27b16
 */
#ifdef GCC_VERSION
#undef GCC_VERSION
#endif

#define	GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)

#ifndef LZ4_FORCE_INLINE
#  ifdef _MSC_VER    /* Visual Studio */
#    define LZ4_FORCE_INLINE static __forceinline
#  else
#    if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
#      ifdef __GNUC__
#        define LZ4_FORCE_INLINE static inline __attribute__((always_inline))
#      else
#        define LZ4_FORCE_INLINE static inline
#      endif
#    else
#      define LZ4_FORCE_INLINE static
#    endif /* __STDC_VERSION__ */
#  endif  /* _MSC_VER */
#endif /* LZ4_FORCE_INLINE */

/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE
 * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
 * together with a simple 8-byte copy loop as a fall-back path.
 * However, this optimization hurts the decompression speed by >30%,
 * because the execution does not go to the optimized loop
 * for typical compressible data, and all of the preamble checks
 * before going to the fall-back path become useless overhead.
 * This optimization happens only with the -O3 flag, and -O2 generates
 * a simple 8-byte copy loop.
 * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
 * functions are annotated with __attribute__((optimize("O2"))),
 * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
 * of LZ4_wildCopy8 does not affect the compression speed.
 */
#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
#  define LZ4_FORCE_O2  __attribute__((optimize("O2")))
#  undef LZ4_FORCE_INLINE
#  define LZ4_FORCE_INLINE  static __inline __attribute__((optimize("O2"),always_inline))
#else
#  define LZ4_FORCE_O2
#endif

#ifndef expect
#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)
#  define expect(expr,value)    (__builtin_expect ((expr),(value)) )
#else
#  define expect(expr,value)    (expr)
#endif
#endif

#ifndef likely
#define	likely(expr)	expect((expr) != 0, 1)
#endif

#ifndef unlikely
#define	unlikely(expr)	expect((expr) != 0, 0)
#endif

#ifndef _KERNEL
#include <stdlib.h>   /* malloc, calloc, free */
#include <string.h>   /* memset, memcpy */
#endif
#define ALLOC(s)          malloc(s)
#define ALLOC_AND_ZERO(s) calloc(1,s)
#define FREEMEM(p)        free(p)

#define MEM_INIT(p,v,s)   memset((p),(v),(s))


/*-************************************
*  Common Constants
**************************************/
#define MINMATCH 4

#define WILDCOPYLENGTH 8
#define LASTLITERALS   5   /* see ../doc/lz4_Block_format.md#parsing-restrictions */
#define MFLIMIT       12   /* see ../doc/lz4_Block_format.md#parsing-restrictions */
#define MATCH_SAFEGUARD_DISTANCE  ((2*WILDCOPYLENGTH) - MINMATCH)   /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */
#define FASTLOOP_SAFE_DISTANCE 64

#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)
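
/*
 * Editorial note: KB, MB and GB are postfix multipliers, so an expression
 * such as "64 KB" expands to "64 *(1 <<10)"; that is how size checks like
 * (dictSize < 64 KB) later in this file are meant to be read.
 */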

#ifndef LZ4_DISTANCE_MAX   /* history window size; can be user-defined at compile time */
#  define LZ4_DISTANCE_MAX 65535   /* set to maximum value by default */
#endif

#define LZ4_DISTANCE_ABSOLUTE_MAX 65535
#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX)   /* max supported by LZ4 format */
#  error "LZ4_DISTANCE_MAX is too big : must be <= 65535"
#endif

#define ML_BITS  4
#define ML_MASK  ((1U<<ML_BITS)-1)
#define RUN_BITS (8-ML_BITS)
#define RUN_MASK ((1U<<RUN_BITS)-1)

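/*
 * Worked example (editorial, per lz4_Block_format.md): each sequence starts
 * with a one-byte token whose high RUN_BITS hold the literal length and
 * whose low ML_BITS hold the match length minus MINMATCH; a field equal to
 * its mask (RUN_MASK / ML_MASK) means "additional length bytes follow".
 * E.g. token 0x54 encodes 5 literals followed by a match of
 * 4 + MINMATCH = 8 bytes.
 */
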
#define DEBUGLOG(l, ...) {}    /* disabled */

#ifndef assert
#define assert ASSERT
#endif

/*-************************************
*  Types
**************************************/
#ifndef _KERNEL
#include <limits.h>
#endif
#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
#ifndef _KERNEL
#include <stdint.h>
#endif
  typedef  uint8_t BYTE;
  typedef uint16_t U16;
  typedef uint32_t U32;
  typedef  int32_t S32;
  typedef uint64_t U64;
  typedef uintptr_t uptrval;
#else
# if UINT_MAX != 4294967295UL
#   error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4"
# endif
  typedef unsigned char       BYTE;
  typedef unsigned short      U16;
  typedef unsigned int        U32;
  typedef   signed int        S32;
  typedef unsigned long long  U64;
  typedef size_t              uptrval;   /* generally true, except OpenVMS-64 */
#endif

#if defined(__x86_64__)
  typedef U64    reg_t;   /* 64-bits in x32 mode */
#else
  typedef size_t reg_t;   /* 32-bits in x32 mode */
#endif

typedef enum {
    notLimited = 0,
    limitedOutput = 1,
    fillOutput = 2
} limitedOutput_directive;


/*-************************************
*  Reading and writing into memory
**************************************/

/**
 * LZ4 relies on memcpy with a constant size being inlined. In freestanding
 * environments, the compiler can't assume the implementation of memcpy() is
 * standard compliant, so it can't apply its specialized memcpy() inlining
 * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze
 * memcpy() as if it were standard compliant, so it can inline it in freestanding
 * environments. This is needed when decompressing the Linux Kernel, for example.
 */
#if defined(__GNUC__) && (__GNUC__ >= 4)
#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
#else
#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size)
#endif

static unsigned LZ4_isLittleEndian(void)
{
    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental */
    return one.c[0];
}


#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2)
/* lie to the compiler about data alignment; use with caution */

static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; }

static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }

#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1)

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign;

static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }

static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }

#else  /* safe and portable access using memcpy() */

static U16 LZ4_read16(const void* memPtr)
{
    U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
}

static void LZ4_write32(void* memPtr, U32 value)
{
    LZ4_memcpy(memPtr, &value, sizeof(value));
}

#endif /* LZ4_FORCE_MEMORY_ACCESS */

static U16 LZ4_readLE16(const void* memPtr)
{
    if (LZ4_isLittleEndian()) {
        return LZ4_read16(memPtr);
    } else {
        const BYTE* p = (const BYTE*)memPtr;
        return (U16)((U16)p[0] + (p[1]<<8));
    }
}
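
/*
 * For example, the byte pair { 0x34, 0x12 } decodes to 0x1234 via
 * LZ4_readLE16() on both little- and big-endian hosts: match offsets are
 * always stored little-endian in the compressed stream.
 */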

/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */
LZ4_FORCE_INLINE
void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd)
{
    BYTE* d = (BYTE*)dstPtr;
    const BYTE* s = (const BYTE*)srcPtr;
    BYTE* const e = (BYTE*)dstEnd;

    do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d<e);
}

static const unsigned inc32table[8] = {0, 1, 2,  1,  0,  4, 4, 4};
static const int      dec64table[8] = {0, 0, 0, -1, -4,  1, 2, 3};


#ifndef LZ4_FAST_DEC_LOOP
#  if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64
#    define LZ4_FAST_DEC_LOOP 1
#  elif defined(__aarch64__) && !defined(__clang__)
     /* On aarch64, we disable this optimization for clang because on certain
      * mobile chipsets, performance is reduced with clang. For information
      * refer to https://github.com/lz4/lz4/pull/707 */
#    define LZ4_FAST_DEC_LOOP 1
#  else
#    define LZ4_FAST_DEC_LOOP 0
#  endif
#endif

#if LZ4_FAST_DEC_LOOP

LZ4_FORCE_INLINE void
LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset)
{
    assert(srcPtr + offset == dstPtr);
    if (offset < 8) {
        LZ4_write32(dstPtr, 0);   /* silence an msan warning when offset==0 */
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];
        srcPtr += inc32table[offset];
        LZ4_memcpy(dstPtr+4, srcPtr, 4);
        srcPtr -= dec64table[offset];
        dstPtr += 8;
    } else {
        LZ4_memcpy(dstPtr, srcPtr, 8);
        dstPtr += 8;
        srcPtr += 8;
    }

    LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
}

/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
 * this version copies two times 16 bytes (instead of one time 32 bytes)
 * because it must be compatible with offsets >= 16. */
LZ4_FORCE_INLINE void
LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd)
{
    BYTE* d = (BYTE*)dstPtr;
    const BYTE* s = (const BYTE*)srcPtr;
    BYTE* const e = (BYTE*)dstEnd;

    do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e);
}

/* LZ4_memcpy_using_offset() presumes :
 * - dstEnd >= dstPtr + MINMATCH
 * - there are at least 8 bytes available to write after dstEnd */
LZ4_FORCE_INLINE void
LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset)
{
    BYTE v[8];

    assert(dstEnd >= dstPtr + MINMATCH);

    switch(offset) {
    case 1:
        MEM_INIT(v, *srcPtr, 8);
        break;
    case 2:
        LZ4_memcpy(v, srcPtr, 2);
        LZ4_memcpy(&v[2], srcPtr, 2);
        LZ4_memcpy(&v[4], v, 4);
        break;
    case 4:
        LZ4_memcpy(v, srcPtr, 4);
        LZ4_memcpy(&v[4], srcPtr, 4);
        break;
    default:
        LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
        return;
    }

    LZ4_memcpy(dstPtr, v, 8);
    dstPtr += 8;
    while (dstPtr < dstEnd) {
        LZ4_memcpy(dstPtr, v, 8);
        dstPtr += 8;
    }
}
#endif
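
/*
 * Editorial note: a match with a small offset is effectively run-length
 * encoding. With offset == 1 and the byte 'A' just written, copying an
 * 8-byte match must produce "AAAAAAAA", i.e. source and destination
 * overlap; that is why small offsets are first expanded into the pattern
 * buffer v[] above (or stepped via inc32table/dec64table) instead of
 * being handed to a plain memcpy(), whose behavior is undefined for
 * overlapping regions.
 */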


/*-************************************
*  Local Structures and types
**************************************/
typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t;

/**
 * This enum distinguishes several different modes of accessing previous
 * content in the stream.
 *
 * - noDict        : There is no preceding content.
 * - withPrefix64k : Table entries up to ctx->dictSize before the current blob
 *                   being compressed are valid and refer to the preceding
 *                   content (of length ctx->dictSize), which is available
 *                   contiguously preceding in memory the content currently
 *                   being compressed.
 * - usingExtDict  : Like withPrefix64k, but the preceding content is somewhere
 *                   else in memory, starting at ctx->dictionary with length
 *                   ctx->dictSize.
 * - usingDictCtx  : Like usingExtDict, but everything concerning the preceding
 *                   content is in a separate context, pointed to by
 *                   ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table
 *                   entries in the current context that refer to positions
 *                   preceding the beginning of the current compression are
 *                   ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx
 *                   ->dictSize describe the location and size of the preceding
 *                   content, and matches are found by looking in the ctx
 *                   ->dictCtx->hashTable.
 */
typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive;
typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;

/*-*******************************
 *  Decompression functions
 ********************************/

typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive;

typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error;

LZ4_FORCE_INLINE unsigned
read_variable_length(const BYTE**ip, const BYTE* lencheck,
                     int loop_check, int initial_check,
                     variable_length_error* error)
{
    U32 length = 0;
    U32 s;
    if (initial_check && unlikely((*ip) >= lencheck)) {    /* overflow detection */
        *error = initial_error;
        return length;
    }
    do {
        s = **ip;
        (*ip)++;
        length += s;
        if (loop_check && unlikely((*ip) >= lencheck)) {    /* overflow detection */
            *error = loop_error;
            return length;
        }
    } while (s==255);

    return length;
}
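
/*
 * Worked example (editorial): a literal run of 270 bytes is encoded as
 * RUN_MASK (15) in the token followed by the extension bytes 255 and 0,
 * since 15 + 255 + 0 = 270. read_variable_length() accumulates those
 * extension bytes; every byte equal to 255 means "keep reading".
 */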

#define	LZ4_STATIC_ASSERT(c)	ASSERT(c)


/*! LZ4_decompress_generic() :
 *  This generic decompression function covers all use cases.
 *  It shall be instantiated several times, using different sets of directives.
 *  Note that it is important for performance that this function really gets inlined,
 *  in order to remove useless branches during compilation optimization.
 */
LZ4_FORCE_INLINE int
LZ4_decompress_generic(
                 const char* const src,
                 char* const dst,
                 int srcSize,
                 int outputSize,         /* If endOnInput==endOnInputSize, this value is `dstCapacity` */

                 endCondition_directive endOnInput,   /* endOnOutputSize, endOnInputSize */
                 earlyEnd_directive partialDecoding,  /* full, partial */
                 dict_directive dict,                 /* noDict, withPrefix64k, usingExtDict */
                 const BYTE* const lowPrefix,  /* always <= dst, == dst when no prefix */
                 const BYTE* const dictStart,  /* only if dict==usingExtDict */
                 const size_t dictSize         /* note : = 0 if noDict */
                 )
{
    if ((src == NULL) || (outputSize < 0)) { return -1; }

    {   const BYTE* ip = (const BYTE*) src;
        const BYTE* const iend = ip + srcSize;

        BYTE* op = (BYTE*) dst;
        BYTE* const oend = op + outputSize;
        BYTE* cpy;

        const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize;

        const int safeDecode = (endOnInput==endOnInputSize);
        const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));


        /* Set up the "end" pointers for the shortcut. */
        const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
        const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;

        const BYTE* match;
        size_t offset;
        unsigned token;
        size_t length;


        DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize);

        /* Special cases */
        assert(lowPrefix <= op);
        if ((endOnInput) && (unlikely(outputSize==0))) {
            /* Empty output buffer */
            if (partialDecoding) return 0;
            return ((srcSize==1) && (*ip==0)) ? 0 : -1;
        }
        if ((!endOnInput) && (unlikely(outputSize==0))) { return (*ip==0 ? 1 : -1); }
        if ((endOnInput) && unlikely(srcSize==0)) { return -1; }

        /* Currently the fast loop shows a regression on Qualcomm ARM chips. */
#if LZ4_FAST_DEC_LOOP
        if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
            DEBUGLOG(6, "skip fast decode loop");
            goto safe_decode;
        }

        /* Fast loop : decode sequences as long as output < oend-FASTLOOP_SAFE_DISTANCE */
        while (1) {
            /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
            assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
            if (endOnInput) { assert(ip < iend); }
            token = *ip++;
            length = token >> ML_BITS;  /* literal length */

            assert(!endOnInput || ip <= iend); /* ip < iend before the increment */

            /* decode literal length */
            if (length == RUN_MASK) {
                variable_length_error error = ok;
                length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error);
                if (error == initial_error) { goto _output_error; }
                if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */
                if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */

                /* copy literals */
                cpy = op+length;
                LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
                if (endOnInput) {  /* LZ4_decompress_safe() */
                    if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; }
                    LZ4_wildCopy32(op, ip, cpy);
                } else {   /* LZ4_decompress_fast() */
                    if (cpy>oend-8) { goto safe_literal_copy; }
                    LZ4_wildCopy8(op, ip, cpy); /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time :
                                                 * it doesn't know input length, and only relies on end-of-block properties */
                }
                ip += length; op = cpy;
            } else {
                cpy = op+length;
                if (endOnInput) {  /* LZ4_decompress_safe() */
                    DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length);
                    /* We don't need to check oend, since we check it once for each loop below */
                    if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; }
                    /* Literals can only be <= 14, but hope compilers optimize if we copy by a register size */
                    LZ4_memcpy(op, ip, 16);
                } else {  /* LZ4_decompress_fast() */
                    /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time :
                     * it doesn't know input length, and relies on end-of-block properties */
                    LZ4_memcpy(op, ip, 8);
                    if (length > 8) { LZ4_memcpy(op+8, ip+8, 8); }
                }
                ip += length; op = cpy;
            }

            /* get offset */
            offset = LZ4_readLE16(ip); ip+=2;
            match = op - offset;
            assert(match <= op);

            /* get matchlength */
            length = token & ML_MASK;

            if (length == ML_MASK) {
                variable_length_error error = ok;
                if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */
                length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error);
                if (error != ok) { goto _output_error; }
                if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */
                length += MINMATCH;
                if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
                    goto safe_match_copy;
                }
            } else {
                length += MINMATCH;
                if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
                    goto safe_match_copy;
                }

                /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
                if ((dict == withPrefix64k) || (match >= lowPrefix)) {
                    if (offset >= 8) {
                        assert(match >= lowPrefix);
                        assert(match <= op);
                        assert(op + 18 <= oend);

                        LZ4_memcpy(op, match, 8);
                        LZ4_memcpy(op+8, match+8, 8);
                        LZ4_memcpy(op+16, match+16, 2);
                        op += length;
                        continue;
            }   }   }
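
            /*
             * Editorial note: the 8+8+2 = 18-byte copy above covers the
             * longest match this branch can see, since token-encoded match
             * lengths run from MINMATCH (4) up to ML_MASK-1+MINMATCH (18)
             * bytes; assert(op + 18 <= oend) guards the overwrite.
             */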

            if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */
            /* match starting within external dictionary */
            if ((dict==usingExtDict) && (match < lowPrefix)) {
                if (unlikely(op+length > oend-LASTLITERALS)) {
                    if (partialDecoding) {
                        DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd");
                        length = MIN(length, (size_t)(oend-op));
                    } else {
                        goto _output_error;  /* end-of-block condition violated */
                }   }

                if (length <= (size_t)(lowPrefix-match)) {
                    /* match fits entirely within external dictionary : just copy */
                    memmove(op, dictEnd - (lowPrefix-match), length);
                    op += length;
                } else {
                    /* match stretches into both external dictionary and current block */
                    size_t const copySize = (size_t)(lowPrefix - match);
                    size_t const restSize = length - copySize;
                    LZ4_memcpy(op, dictEnd - copySize, copySize);
                    op += copySize;
                    if (restSize > (size_t)(op - lowPrefix)) {  /* overlap copy */
                        BYTE* const endOfMatch = op + restSize;
                        const BYTE* copyFrom = lowPrefix;
                        while (op < endOfMatch) { *op++ = *copyFrom++; }
                    } else {
                        LZ4_memcpy(op, lowPrefix, restSize);
                        op += restSize;
                }   }
                continue;
            }

            /* copy match within block */
            cpy = op + length;

            assert((op <= oend) && (oend-op >= 32));
            if (unlikely(offset<16)) {
                LZ4_memcpy_using_offset(op, match, cpy, offset);
            } else {
                LZ4_wildCopy32(op, match, cpy);
            }

            op = cpy;   /* wildcopy correction */
        }
    safe_decode:
#endif

        /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */
        while (1) {
            token = *ip++;
            length = token >> ML_BITS;  /* literal length */

            assert(!endOnInput || ip <= iend); /* ip < iend before the increment */

            /* A two-stage shortcut for the most common case:
             * 1) If the literal length is 0..14, and there is enough space,
             * enter the shortcut and copy 16 bytes on behalf of the literals
             * (in the fast mode, only 8 bytes can be safely copied this way).
             * 2) Further if the match length is 4..18, copy 18 bytes in a similar
             * manner; but we ensure that there's enough space in the output for
             * those 18 bytes earlier, upon entering the shortcut (in other words,
             * there is a combined check for both stages).
             */
            if ( (endOnInput ? length != RUN_MASK : length <= 8)
                /* strictly "less than" on input, to re-enter the loop with at least one byte */
              && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) {
                /* Copy the literals */
                LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
                op += length; ip += length;

                /* The second stage: prepare for match copying, decode full info.
                 * If it doesn't work out, the info won't be wasted. */
                length = token & ML_MASK; /* match length */
                offset = LZ4_readLE16(ip); ip += 2;
                match = op - offset;
                assert(match <= op); /* check overflow */

                /* Do not deal with overlapping matches. */
                if ( (length != ML_MASK)
                  && (offset >= 8)
                  && (dict==withPrefix64k || match >= lowPrefix) ) {
                    /* Copy the match. */
                    LZ4_memcpy(op + 0, match + 0, 8);
                    LZ4_memcpy(op + 8, match + 8, 8);
                    LZ4_memcpy(op +16, match +16, 2);
                    op += length + MINMATCH;
                    /* Both stages worked, load the next token. */
                    continue;
                }

                /* The second stage didn't work out, but the info is ready.
                 * Propel it right to the point of match copying. */
                goto _copy_match;
            }

            /* decode literal length */
            if (length == RUN_MASK) {
                variable_length_error error = ok;
                length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error);
                if (error == initial_error) { goto _output_error; }
                if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */
                if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */
            }

            /* copy literals */
            cpy = op+length;
#if LZ4_FAST_DEC_LOOP
        safe_literal_copy:
#endif
            LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
            if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) )
              || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) )
            {
                /* We've either hit the input parsing restriction or the output parsing restriction.
                 * In the normal scenario, decoding a full block, it must be the last sequence,
                 * otherwise it's an error (invalid input or dimensions).
                 * In the partialDecoding scenario, it's necessary to ensure there is no buffer overflow.
                 */
                if (partialDecoding) {
                    /* Since we are partially decoding, we may be in this block because of the
                     * output parsing restriction, which does not apply here, since the output
                     * buffer is allowed to be undersized.
                     */
                    assert(endOnInput);
                    DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end");
                    DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length);
                    DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op));
                    DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip));
                    /* Finishing in the middle of a literals segment,
                     * due to lack of input.
                     */
                    if (ip+length > iend) {
                        length = (size_t)(iend-ip);
                        cpy = op + length;
                    }
                    /* Finishing in the middle of a literals segment,
                     * due to lack of output space.
                     */
                    if (cpy > oend) {
                        cpy = oend;
                        assert(op<=oend);
                        length = (size_t)(oend-op);
                    }
                } else {
                    /* We must be on the last sequence because of the parsing limitations so check
                     * that we exactly regenerate the original size (must be exact when !endOnInput).
                     */
                    if ((!endOnInput) && (cpy != oend)) { goto _output_error; }
                    /* We must be on the last sequence (or invalid) because of the parsing limitations
                     * so check that we exactly consume the input and don't overrun the output buffer.
                     */
                    if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) {
                        DEBUGLOG(6, "should have been last run of literals");
                        DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend);
                        DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend);
                        goto _output_error;
                    }
                }
                memmove(op, ip, length);  /* supports overlapping memory regions; only matters for in-place decompression scenarios */
                ip += length;
                op += length;
                /* Necessarily EOF when !partialDecoding.
                 * When partialDecoding, it is EOF if we've either
                 * filled the output buffer or
                 * can't proceed with reading an offset for following match.
                 */
                if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) {
                    break;
                }
            } else {
                LZ4_wildCopy8(op, ip, cpy);   /* may overwrite up to WILDCOPYLENGTH beyond cpy */
                ip += length; op = cpy;
            }

            /* get offset */
            offset = LZ4_readLE16(ip); ip+=2;
            match = op - offset;

            /* get matchlength */
            length = token & ML_MASK;

    _copy_match:
            if (length == ML_MASK) {
                variable_length_error error = ok;
                length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error);
                if (error != ok) goto _output_error;
                if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error;   /* overflow detection */
            }
            length += MINMATCH;

#if LZ4_FAST_DEC_LOOP
        safe_match_copy:
#endif
            if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error;   /* Error : offset outside buffers */
            /* match starting within external dictionary */
            if ((dict==usingExtDict) && (match < lowPrefix)) {
                if (unlikely(op+length > oend-LASTLITERALS)) {
                    if (partialDecoding) length = MIN(length, (size_t)(oend-op));
                    else goto _output_error;   /* doesn't respect parsing restriction */
                }

                if (length <= (size_t)(lowPrefix-match)) {
                    /* match fits entirely within external dictionary : just copy */
                    memmove(op, dictEnd - (lowPrefix-match), length);
                    op += length;
                } else {
                    /* match stretches into both external dictionary and current block */
                    size_t const copySize = (size_t)(lowPrefix - match);
                    size_t const restSize = length - copySize;
                    LZ4_memcpy(op, dictEnd - copySize, copySize);
                    op += copySize;
                    if (restSize > (size_t)(op - lowPrefix)) {  /* overlap copy */
                        BYTE* const endOfMatch = op + restSize;
                        const BYTE* copyFrom = lowPrefix;
                        while (op < endOfMatch) *op++ = *copyFrom++;
                    } else {
                        LZ4_memcpy(op, lowPrefix, restSize);
                        op += restSize;
                }   }
                continue;
            }
            assert(match >= lowPrefix);

            /* copy match within block */
            cpy = op + length;

            /* partialDecoding : may end anywhere within the block */
            assert(op<=oend);
            if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) {
                size_t const mlen = MIN(length, (size_t)(oend-op));
                const BYTE* const matchEnd = match + mlen;
                BYTE* const copyEnd = op + mlen;
                if (matchEnd > op) {   /* overlap copy */
                    while (op < copyEnd) { *op++ = *match++; }
                } else {
                    LZ4_memcpy(op, match, mlen);
                }
                op = copyEnd;
                if (op == oend) { break; }
                continue;
            }

            if (unlikely(offset<8)) {
                LZ4_write32(op, 0);   /* silence msan warning when offset==0 */
                op[0] = match[0];
                op[1] = match[1];
                op[2] = match[2];
                op[3] = match[3];
                match += inc32table[offset];
                LZ4_memcpy(op+4, match, 4);
                match -= dec64table[offset];
            } else {
                LZ4_memcpy(op, match, 8);
                match += 8;
            }
            op += 8;

            if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) {
                BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1);
                if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */
                if (op < oCopyLimit) {
                    LZ4_wildCopy8(op, match, oCopyLimit);
                    match += oCopyLimit - op;
                    op = oCopyLimit;
                }
                while (op < cpy) { *op++ = *match++; }
            } else {
                LZ4_memcpy(op, match, 8);
                if (length > 16)  { LZ4_wildCopy8(op+8, match+8, cpy); }
            }
            op = cpy;   /* wildcopy correction */
        }

        /* end of decoding */
        if (endOnInput) {
            DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst));
            return (int) (((char*)op)-dst);     /* Nb of output bytes decoded */
        } else {
            return (int) (((const char*)ip)-src);   /* Nb of input bytes read */
        }

        /* Overflow error detected */
    _output_error:
        return (int) (-(((const char*)ip)-src))-1;
    }
}

/*
 * LZ4_uncompress_unknownOutputSize() :
 * 	isize  : the input size, i.e. the compressed size
 * 	maxOutputSize : the size of the destination buffer (which must
 * 		already be allocated)
 * 	return : the number of bytes decoded in the destination buffer
 * 		(necessarily <= maxOutputSize). If the source stream is
 * 		malformed, the function stops decoding and returns a
 * 		negative result, indicating the byte position of the faulty
 * 		instruction. This function never writes beyond dest +
 * 		maxOutputSize, and is therefore protected against malicious
 * 		data packets.
 * 	note   : This version is slightly slower than real_LZ4_uncompress()
 */

/*
 * Note: In upstream code, LZ4_uncompress_unknownOutputSize is now a legacy
 *	wrapper for LZ4_decompress_safe, which is a wrapper for
 *	LZ4_decompress_generic; this wrapper flattens that, rather than
 *	rewriting the callers.
 */
int LZ4_uncompress_unknownOutputSize(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
{
    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize,
                                  endOnInputSize, decode_full_block, noDict,
                                  (BYTE*)dest, NULL, 0);
}
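
/*
 * Hedged usage sketch (editorial, not part of the upstream file); the
 * buffer names and sizes here are hypothetical:
 *
 *	char dst[65536];
 *	int decoded = LZ4_uncompress_unknownOutputSize(src, dst,
 *	    compressed_len, sizeof (dst));
 *	if (decoded < 0)
 *		return (EIO);	// malformed or truncated input
 *
 * A negative return means the input was malformed; a non-negative return
 * is the number of bytes written to dst, never more than sizeof (dst).
 */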