xref: /freebsd/sys/contrib/zstd/lib/common/entropy_common.c (revision 37f1f2684f2670b204080ef2d6c303becd28545f)
1*37f1f268SConrad Meyer /* ******************************************************************
2*37f1f268SConrad Meyer  * Common functions of New Generation Entropy library
3*37f1f268SConrad Meyer  * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
4*37f1f268SConrad Meyer  *
5*37f1f268SConrad Meyer  *  You can contact the author at :
6*37f1f268SConrad Meyer  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
7*37f1f268SConrad Meyer  *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
8*37f1f268SConrad Meyer  *
9*37f1f268SConrad Meyer  * This source code is licensed under both the BSD-style license (found in the
10*37f1f268SConrad Meyer  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
11*37f1f268SConrad Meyer  * in the COPYING file in the root directory of this source tree).
12*37f1f268SConrad Meyer  * You may select, at your option, one of the above-listed licenses.
13*37f1f268SConrad Meyer ****************************************************************** */
140c16b537SWarner Losh 
150c16b537SWarner Losh /* *************************************
160c16b537SWarner Losh *  Dependencies
170c16b537SWarner Losh ***************************************/
180c16b537SWarner Losh #include "mem.h"
190c16b537SWarner Losh #include "error_private.h"       /* ERR_*, ERROR */
200c16b537SWarner Losh #define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
210c16b537SWarner Losh #include "fse.h"
220c16b537SWarner Losh #define HUF_STATIC_LINKING_ONLY  /* HUF_TABLELOG_ABSOLUTEMAX */
230c16b537SWarner Losh #include "huf.h"
240c16b537SWarner Losh 
250c16b537SWarner Losh 
260c16b537SWarner Losh /*===   Version   ===*/
270c16b537SWarner Losh unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
280c16b537SWarner Losh 
290c16b537SWarner Losh 
300c16b537SWarner Losh /*===   Error Management   ===*/
310c16b537SWarner Losh unsigned FSE_isError(size_t code) { return ERR_isError(code); }
320c16b537SWarner Losh const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
330c16b537SWarner Losh 
340c16b537SWarner Losh unsigned HUF_isError(size_t code) { return ERR_isError(code); }
350c16b537SWarner Losh const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
360c16b537SWarner Losh 
370c16b537SWarner Losh 
380c16b537SWarner Losh /*-**************************************************************
390c16b537SWarner Losh *  FSE NCount encoding-decoding
400c16b537SWarner Losh ****************************************************************/
410c16b537SWarner Losh size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
420c16b537SWarner Losh                  const void* headerBuffer, size_t hbSize)
430c16b537SWarner Losh {
440c16b537SWarner Losh     const BYTE* const istart = (const BYTE*) headerBuffer;
450c16b537SWarner Losh     const BYTE* const iend = istart + hbSize;
460c16b537SWarner Losh     const BYTE* ip = istart;
470c16b537SWarner Losh     int nbBits;
480c16b537SWarner Losh     int remaining;
490c16b537SWarner Losh     int threshold;
500c16b537SWarner Losh     U32 bitStream;
510c16b537SWarner Losh     int bitCount;
520c16b537SWarner Losh     unsigned charnum = 0;
530c16b537SWarner Losh     int previous0 = 0;
540c16b537SWarner Losh 
550f743729SConrad Meyer     if (hbSize < 4) {
560f743729SConrad Meyer         /* This function only works when hbSize >= 4 */
570f743729SConrad Meyer         char buffer[4];
580f743729SConrad Meyer         memset(buffer, 0, sizeof(buffer));
590f743729SConrad Meyer         memcpy(buffer, headerBuffer, hbSize);
600f743729SConrad Meyer         {   size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
610f743729SConrad Meyer                                                     buffer, sizeof(buffer));
620f743729SConrad Meyer             if (FSE_isError(countSize)) return countSize;
630f743729SConrad Meyer             if (countSize > hbSize) return ERROR(corruption_detected);
640f743729SConrad Meyer             return countSize;
650f743729SConrad Meyer     }   }
660f743729SConrad Meyer     assert(hbSize >= 4);
670f743729SConrad Meyer 
680f743729SConrad Meyer     /* init */
690f743729SConrad Meyer     memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0]));   /* all symbols not present in NCount have a frequency of 0 */
700c16b537SWarner Losh     bitStream = MEM_readLE32(ip);
710c16b537SWarner Losh     nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
720c16b537SWarner Losh     if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
730c16b537SWarner Losh     bitStream >>= 4;
740c16b537SWarner Losh     bitCount = 4;
750c16b537SWarner Losh     *tableLogPtr = nbBits;
760c16b537SWarner Losh     remaining = (1<<nbBits)+1;
770c16b537SWarner Losh     threshold = 1<<nbBits;
780c16b537SWarner Losh     nbBits++;
790c16b537SWarner Losh 
800c16b537SWarner Losh     while ((remaining>1) & (charnum<=*maxSVPtr)) {
810c16b537SWarner Losh         if (previous0) {
820c16b537SWarner Losh             unsigned n0 = charnum;
830c16b537SWarner Losh             while ((bitStream & 0xFFFF) == 0xFFFF) {
840c16b537SWarner Losh                 n0 += 24;
850c16b537SWarner Losh                 if (ip < iend-5) {
860c16b537SWarner Losh                     ip += 2;
870c16b537SWarner Losh                     bitStream = MEM_readLE32(ip) >> bitCount;
880c16b537SWarner Losh                 } else {
890c16b537SWarner Losh                     bitStream >>= 16;
900c16b537SWarner Losh                     bitCount   += 16;
910c16b537SWarner Losh             }   }
920c16b537SWarner Losh             while ((bitStream & 3) == 3) {
930c16b537SWarner Losh                 n0 += 3;
940c16b537SWarner Losh                 bitStream >>= 2;
950c16b537SWarner Losh                 bitCount += 2;
960c16b537SWarner Losh             }
970c16b537SWarner Losh             n0 += bitStream & 3;
980c16b537SWarner Losh             bitCount += 2;
990c16b537SWarner Losh             if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
1000c16b537SWarner Losh             while (charnum < n0) normalizedCounter[charnum++] = 0;
1010c16b537SWarner Losh             if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
1020f743729SConrad Meyer                 assert((bitCount >> 3) <= 3); /* For first condition to work */
1030c16b537SWarner Losh                 ip += bitCount>>3;
1040c16b537SWarner Losh                 bitCount &= 7;
1050c16b537SWarner Losh                 bitStream = MEM_readLE32(ip) >> bitCount;
1060c16b537SWarner Losh             } else {
1070c16b537SWarner Losh                 bitStream >>= 2;
1080c16b537SWarner Losh         }   }
1090c16b537SWarner Losh         {   int const max = (2*threshold-1) - remaining;
1100c16b537SWarner Losh             int count;
1110c16b537SWarner Losh 
1120c16b537SWarner Losh             if ((bitStream & (threshold-1)) < (U32)max) {
1130c16b537SWarner Losh                 count = bitStream & (threshold-1);
1140c16b537SWarner Losh                 bitCount += nbBits-1;
1150c16b537SWarner Losh             } else {
1160c16b537SWarner Losh                 count = bitStream & (2*threshold-1);
1170c16b537SWarner Losh                 if (count >= threshold) count -= max;
1180c16b537SWarner Losh                 bitCount += nbBits;
1190c16b537SWarner Losh             }
1200c16b537SWarner Losh 
1210c16b537SWarner Losh             count--;   /* extra accuracy */
1220c16b537SWarner Losh             remaining -= count < 0 ? -count : count;   /* -1 means +1 */
1230c16b537SWarner Losh             normalizedCounter[charnum++] = (short)count;
1240c16b537SWarner Losh             previous0 = !count;
1250c16b537SWarner Losh             while (remaining < threshold) {
1260c16b537SWarner Losh                 nbBits--;
1270c16b537SWarner Losh                 threshold >>= 1;
1280c16b537SWarner Losh             }
1290c16b537SWarner Losh 
1300c16b537SWarner Losh             if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
1310c16b537SWarner Losh                 ip += bitCount>>3;
1320c16b537SWarner Losh                 bitCount &= 7;
1330c16b537SWarner Losh             } else {
1340c16b537SWarner Losh                 bitCount -= (int)(8 * (iend - 4 - ip));
1350c16b537SWarner Losh                 ip = iend - 4;
1360c16b537SWarner Losh             }
1370c16b537SWarner Losh             bitStream = MEM_readLE32(ip) >> (bitCount & 31);
1380c16b537SWarner Losh     }   }   /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
1390c16b537SWarner Losh     if (remaining != 1) return ERROR(corruption_detected);
1400c16b537SWarner Losh     if (bitCount > 32) return ERROR(corruption_detected);
1410c16b537SWarner Losh     *maxSVPtr = charnum-1;
1420c16b537SWarner Losh 
1430c16b537SWarner Losh     ip += (bitCount+7)>>3;
1440c16b537SWarner Losh     return ip-istart;
1450c16b537SWarner Losh }
1460c16b537SWarner Losh 
1470c16b537SWarner Losh 
1480c16b537SWarner Losh /*! HUF_readStats() :
1490c16b537SWarner Losh     Read compact Huffman tree, saved by HUF_writeCTable().
1500c16b537SWarner Losh     `huffWeight` is destination buffer.
1510c16b537SWarner Losh     `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
1520c16b537SWarner Losh     @return : size read from `src` , or an error Code .
1530c16b537SWarner Losh     Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
1540c16b537SWarner Losh */
1550c16b537SWarner Losh size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
1560c16b537SWarner Losh                      U32* nbSymbolsPtr, U32* tableLogPtr,
1570c16b537SWarner Losh                      const void* src, size_t srcSize)
1580c16b537SWarner Losh {
1590c16b537SWarner Losh     U32 weightTotal;
1600c16b537SWarner Losh     const BYTE* ip = (const BYTE*) src;
1610c16b537SWarner Losh     size_t iSize;
1620c16b537SWarner Losh     size_t oSize;
1630c16b537SWarner Losh 
1640c16b537SWarner Losh     if (!srcSize) return ERROR(srcSize_wrong);
1650c16b537SWarner Losh     iSize = ip[0];
1660c16b537SWarner Losh     /* memset(huffWeight, 0, hwSize);   *//* is not necessary, even though some analyzer complain ... */
1670c16b537SWarner Losh 
1680c16b537SWarner Losh     if (iSize >= 128) {  /* special header */
1690c16b537SWarner Losh         oSize = iSize - 127;
1700c16b537SWarner Losh         iSize = ((oSize+1)/2);
1710c16b537SWarner Losh         if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
1720c16b537SWarner Losh         if (oSize >= hwSize) return ERROR(corruption_detected);
1730c16b537SWarner Losh         ip += 1;
1740c16b537SWarner Losh         {   U32 n;
1750c16b537SWarner Losh             for (n=0; n<oSize; n+=2) {
1760c16b537SWarner Losh                 huffWeight[n]   = ip[n/2] >> 4;
1770c16b537SWarner Losh                 huffWeight[n+1] = ip[n/2] & 15;
1780c16b537SWarner Losh     }   }   }
1790c16b537SWarner Losh     else  {   /* header compressed with FSE (normal case) */
1800c16b537SWarner Losh         FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)];  /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
1810c16b537SWarner Losh         if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
1820c16b537SWarner Losh         oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6);   /* max (hwSize-1) values decoded, as last one is implied */
1830c16b537SWarner Losh         if (FSE_isError(oSize)) return oSize;
1840c16b537SWarner Losh     }
1850c16b537SWarner Losh 
1860c16b537SWarner Losh     /* collect weight stats */
1870c16b537SWarner Losh     memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
1880c16b537SWarner Losh     weightTotal = 0;
1890c16b537SWarner Losh     {   U32 n; for (n=0; n<oSize; n++) {
1900c16b537SWarner Losh             if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
1910c16b537SWarner Losh             rankStats[huffWeight[n]]++;
1920c16b537SWarner Losh             weightTotal += (1 << huffWeight[n]) >> 1;
1930c16b537SWarner Losh     }   }
1940c16b537SWarner Losh     if (weightTotal == 0) return ERROR(corruption_detected);
1950c16b537SWarner Losh 
1960c16b537SWarner Losh     /* get last non-null symbol weight (implied, total must be 2^n) */
1970c16b537SWarner Losh     {   U32 const tableLog = BIT_highbit32(weightTotal) + 1;
1980c16b537SWarner Losh         if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
1990c16b537SWarner Losh         *tableLogPtr = tableLog;
2000c16b537SWarner Losh         /* determine last weight */
2010c16b537SWarner Losh         {   U32 const total = 1 << tableLog;
2020c16b537SWarner Losh             U32 const rest = total - weightTotal;
2030c16b537SWarner Losh             U32 const verif = 1 << BIT_highbit32(rest);
2040c16b537SWarner Losh             U32 const lastWeight = BIT_highbit32(rest) + 1;
2050c16b537SWarner Losh             if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
2060c16b537SWarner Losh             huffWeight[oSize] = (BYTE)lastWeight;
2070c16b537SWarner Losh             rankStats[lastWeight]++;
2080c16b537SWarner Losh     }   }
2090c16b537SWarner Losh 
2100c16b537SWarner Losh     /* check tree construction validity */
2110c16b537SWarner Losh     if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
2120c16b537SWarner Losh 
2130c16b537SWarner Losh     /* results */
2140c16b537SWarner Losh     *nbSymbolsPtr = (U32)(oSize+1);
2150c16b537SWarner Losh     return iSize+1;
2160c16b537SWarner Losh }
217