xref: /freebsd/sys/contrib/zstd/lib/dictBuilder/zdict.c (revision 0c16b53773565120a8f80a31a0af2ef56ccd368e)
1*0c16b537SWarner Losh /*
2*0c16b537SWarner Losh  * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
3*0c16b537SWarner Losh  * All rights reserved.
4*0c16b537SWarner Losh  *
5*0c16b537SWarner Losh  * This source code is licensed under both the BSD-style license (found in the
6*0c16b537SWarner Losh  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*0c16b537SWarner Losh  * in the COPYING file in the root directory of this source tree).
8*0c16b537SWarner Losh  * You may select, at your option, one of the above-listed licenses.
9*0c16b537SWarner Losh  */
10*0c16b537SWarner Losh 
11*0c16b537SWarner Losh 
12*0c16b537SWarner Losh /*-**************************************
13*0c16b537SWarner Losh *  Tuning parameters
14*0c16b537SWarner Losh ****************************************/
15*0c16b537SWarner Losh #define MINRATIO 4   /* minimum nb of apparition to be selected in dictionary */
16*0c16b537SWarner Losh #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
17*0c16b537SWarner Losh #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
18*0c16b537SWarner Losh 
19*0c16b537SWarner Losh 
20*0c16b537SWarner Losh /*-**************************************
21*0c16b537SWarner Losh *  Compiler Options
22*0c16b537SWarner Losh ****************************************/
23*0c16b537SWarner Losh /* Unix Large Files support (>4GB) */
24*0c16b537SWarner Losh #define _FILE_OFFSET_BITS 64
25*0c16b537SWarner Losh #if (defined(__sun__) && (!defined(__LP64__)))   /* Sun Solaris 32-bits requires specific definitions */
26*0c16b537SWarner Losh #  define _LARGEFILE_SOURCE
27*0c16b537SWarner Losh #elif ! defined(__LP64__)                        /* No point defining Large file for 64 bit */
28*0c16b537SWarner Losh #  define _LARGEFILE64_SOURCE
29*0c16b537SWarner Losh #endif
30*0c16b537SWarner Losh 
31*0c16b537SWarner Losh 
32*0c16b537SWarner Losh /*-*************************************
33*0c16b537SWarner Losh *  Dependencies
34*0c16b537SWarner Losh ***************************************/
35*0c16b537SWarner Losh #include <stdlib.h>        /* malloc, free */
36*0c16b537SWarner Losh #include <string.h>        /* memset */
37*0c16b537SWarner Losh #include <stdio.h>         /* fprintf, fopen, ftello64 */
38*0c16b537SWarner Losh #include <time.h>          /* clock */
39*0c16b537SWarner Losh 
40*0c16b537SWarner Losh #include "mem.h"           /* read */
41*0c16b537SWarner Losh #include "fse.h"           /* FSE_normalizeCount, FSE_writeNCount */
42*0c16b537SWarner Losh #define HUF_STATIC_LINKING_ONLY
43*0c16b537SWarner Losh #include "huf.h"           /* HUF_buildCTable, HUF_writeCTable */
44*0c16b537SWarner Losh #include "zstd_internal.h" /* includes zstd.h */
45*0c16b537SWarner Losh #include "xxhash.h"        /* XXH64 */
46*0c16b537SWarner Losh #include "divsufsort.h"
47*0c16b537SWarner Losh #ifndef ZDICT_STATIC_LINKING_ONLY
48*0c16b537SWarner Losh #  define ZDICT_STATIC_LINKING_ONLY
49*0c16b537SWarner Losh #endif
50*0c16b537SWarner Losh #include "zdict.h"
51*0c16b537SWarner Losh 
52*0c16b537SWarner Losh 
53*0c16b537SWarner Losh /*-*************************************
54*0c16b537SWarner Losh *  Constants
55*0c16b537SWarner Losh ***************************************/
56*0c16b537SWarner Losh #define KB *(1 <<10)
57*0c16b537SWarner Losh #define MB *(1 <<20)
58*0c16b537SWarner Losh #define GB *(1U<<30)
59*0c16b537SWarner Losh 
60*0c16b537SWarner Losh #define DICTLISTSIZE_DEFAULT 10000
61*0c16b537SWarner Losh 
62*0c16b537SWarner Losh #define NOISELENGTH 32
63*0c16b537SWarner Losh 
64*0c16b537SWarner Losh static const int g_compressionLevel_default = 3;
65*0c16b537SWarner Losh static const U32 g_selectivity_default = 9;
66*0c16b537SWarner Losh 
67*0c16b537SWarner Losh 
68*0c16b537SWarner Losh /*-*************************************
69*0c16b537SWarner Losh *  Console display
70*0c16b537SWarner Losh ***************************************/
71*0c16b537SWarner Losh #define DISPLAY(...)         { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
72*0c16b537SWarner Losh #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); }    /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
73*0c16b537SWarner Losh 
74*0c16b537SWarner Losh static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
75*0c16b537SWarner Losh 
76*0c16b537SWarner Losh static void ZDICT_printHex(const void* ptr, size_t length)
77*0c16b537SWarner Losh {
78*0c16b537SWarner Losh     const BYTE* const b = (const BYTE*)ptr;
79*0c16b537SWarner Losh     size_t u;
80*0c16b537SWarner Losh     for (u=0; u<length; u++) {
81*0c16b537SWarner Losh         BYTE c = b[u];
82*0c16b537SWarner Losh         if (c<32 || c>126) c = '.';   /* non-printable char */
83*0c16b537SWarner Losh         DISPLAY("%c", c);
84*0c16b537SWarner Losh     }
85*0c16b537SWarner Losh }
86*0c16b537SWarner Losh 
87*0c16b537SWarner Losh 
88*0c16b537SWarner Losh /*-********************************************************
89*0c16b537SWarner Losh *  Helper functions
90*0c16b537SWarner Losh **********************************************************/
91*0c16b537SWarner Losh unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
92*0c16b537SWarner Losh 
93*0c16b537SWarner Losh const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
94*0c16b537SWarner Losh 
95*0c16b537SWarner Losh unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
96*0c16b537SWarner Losh {
97*0c16b537SWarner Losh     if (dictSize < 8) return 0;
98*0c16b537SWarner Losh     if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
99*0c16b537SWarner Losh     return MEM_readLE32((const char*)dictBuffer + 4);
100*0c16b537SWarner Losh }
101*0c16b537SWarner Losh 
102*0c16b537SWarner Losh 
103*0c16b537SWarner Losh /*-********************************************************
104*0c16b537SWarner Losh *  Dictionary training functions
105*0c16b537SWarner Losh **********************************************************/
106*0c16b537SWarner Losh static unsigned ZDICT_NbCommonBytes (register size_t val)
107*0c16b537SWarner Losh {
108*0c16b537SWarner Losh     if (MEM_isLittleEndian()) {
109*0c16b537SWarner Losh         if (MEM_64bits()) {
110*0c16b537SWarner Losh #       if defined(_MSC_VER) && defined(_WIN64)
111*0c16b537SWarner Losh             unsigned long r = 0;
112*0c16b537SWarner Losh             _BitScanForward64( &r, (U64)val );
113*0c16b537SWarner Losh             return (unsigned)(r>>3);
114*0c16b537SWarner Losh #       elif defined(__GNUC__) && (__GNUC__ >= 3)
115*0c16b537SWarner Losh             return (__builtin_ctzll((U64)val) >> 3);
116*0c16b537SWarner Losh #       else
117*0c16b537SWarner Losh             static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
118*0c16b537SWarner Losh             return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
119*0c16b537SWarner Losh #       endif
120*0c16b537SWarner Losh         } else { /* 32 bits */
121*0c16b537SWarner Losh #       if defined(_MSC_VER)
122*0c16b537SWarner Losh             unsigned long r=0;
123*0c16b537SWarner Losh             _BitScanForward( &r, (U32)val );
124*0c16b537SWarner Losh             return (unsigned)(r>>3);
125*0c16b537SWarner Losh #       elif defined(__GNUC__) && (__GNUC__ >= 3)
126*0c16b537SWarner Losh             return (__builtin_ctz((U32)val) >> 3);
127*0c16b537SWarner Losh #       else
128*0c16b537SWarner Losh             static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
129*0c16b537SWarner Losh             return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
130*0c16b537SWarner Losh #       endif
131*0c16b537SWarner Losh         }
132*0c16b537SWarner Losh     } else {  /* Big Endian CPU */
133*0c16b537SWarner Losh         if (MEM_64bits()) {
134*0c16b537SWarner Losh #       if defined(_MSC_VER) && defined(_WIN64)
135*0c16b537SWarner Losh             unsigned long r = 0;
136*0c16b537SWarner Losh             _BitScanReverse64( &r, val );
137*0c16b537SWarner Losh             return (unsigned)(r>>3);
138*0c16b537SWarner Losh #       elif defined(__GNUC__) && (__GNUC__ >= 3)
139*0c16b537SWarner Losh             return (__builtin_clzll(val) >> 3);
140*0c16b537SWarner Losh #       else
141*0c16b537SWarner Losh             unsigned r;
142*0c16b537SWarner Losh             const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
143*0c16b537SWarner Losh             if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
144*0c16b537SWarner Losh             if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
145*0c16b537SWarner Losh             r += (!val);
146*0c16b537SWarner Losh             return r;
147*0c16b537SWarner Losh #       endif
148*0c16b537SWarner Losh         } else { /* 32 bits */
149*0c16b537SWarner Losh #       if defined(_MSC_VER)
150*0c16b537SWarner Losh             unsigned long r = 0;
151*0c16b537SWarner Losh             _BitScanReverse( &r, (unsigned long)val );
152*0c16b537SWarner Losh             return (unsigned)(r>>3);
153*0c16b537SWarner Losh #       elif defined(__GNUC__) && (__GNUC__ >= 3)
154*0c16b537SWarner Losh             return (__builtin_clz((U32)val) >> 3);
155*0c16b537SWarner Losh #       else
156*0c16b537SWarner Losh             unsigned r;
157*0c16b537SWarner Losh             if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
158*0c16b537SWarner Losh             r += (!val);
159*0c16b537SWarner Losh             return r;
160*0c16b537SWarner Losh #       endif
161*0c16b537SWarner Losh     }   }
162*0c16b537SWarner Losh }
163*0c16b537SWarner Losh 
164*0c16b537SWarner Losh 
165*0c16b537SWarner Losh /*! ZDICT_count() :
166*0c16b537SWarner Losh     Count the nb of common bytes between 2 pointers.
167*0c16b537SWarner Losh     Note : this function presumes end of buffer followed by noisy guard band.
168*0c16b537SWarner Losh */
169*0c16b537SWarner Losh static size_t ZDICT_count(const void* pIn, const void* pMatch)
170*0c16b537SWarner Losh {
171*0c16b537SWarner Losh     const char* const pStart = (const char*)pIn;
172*0c16b537SWarner Losh     for (;;) {
173*0c16b537SWarner Losh         size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
174*0c16b537SWarner Losh         if (!diff) {
175*0c16b537SWarner Losh             pIn = (const char*)pIn+sizeof(size_t);
176*0c16b537SWarner Losh             pMatch = (const char*)pMatch+sizeof(size_t);
177*0c16b537SWarner Losh             continue;
178*0c16b537SWarner Losh         }
179*0c16b537SWarner Losh         pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
180*0c16b537SWarner Losh         return (size_t)((const char*)pIn - pStart);
181*0c16b537SWarner Losh     }
182*0c16b537SWarner Losh }
183*0c16b537SWarner Losh 
184*0c16b537SWarner Losh 
185*0c16b537SWarner Losh typedef struct {
186*0c16b537SWarner Losh     U32 pos;
187*0c16b537SWarner Losh     U32 length;
188*0c16b537SWarner Losh     U32 savings;
189*0c16b537SWarner Losh } dictItem;
190*0c16b537SWarner Losh 
191*0c16b537SWarner Losh static void ZDICT_initDictItem(dictItem* d)
192*0c16b537SWarner Losh {
193*0c16b537SWarner Losh     d->pos = 1;
194*0c16b537SWarner Losh     d->length = 0;
195*0c16b537SWarner Losh     d->savings = (U32)(-1);
196*0c16b537SWarner Losh }
197*0c16b537SWarner Losh 
198*0c16b537SWarner Losh 
199*0c16b537SWarner Losh #define LLIMIT 64          /* heuristic determined experimentally */
200*0c16b537SWarner Losh #define MINMATCHLENGTH 7   /* heuristic determined experimentally */
201*0c16b537SWarner Losh static dictItem ZDICT_analyzePos(
202*0c16b537SWarner Losh                        BYTE* doneMarks,
203*0c16b537SWarner Losh                        const int* suffix, U32 start,
204*0c16b537SWarner Losh                        const void* buffer, U32 minRatio, U32 notificationLevel)
205*0c16b537SWarner Losh {
206*0c16b537SWarner Losh     U32 lengthList[LLIMIT] = {0};
207*0c16b537SWarner Losh     U32 cumulLength[LLIMIT] = {0};
208*0c16b537SWarner Losh     U32 savings[LLIMIT] = {0};
209*0c16b537SWarner Losh     const BYTE* b = (const BYTE*)buffer;
210*0c16b537SWarner Losh     size_t length;
211*0c16b537SWarner Losh     size_t maxLength = LLIMIT;
212*0c16b537SWarner Losh     size_t pos = suffix[start];
213*0c16b537SWarner Losh     U32 end = start;
214*0c16b537SWarner Losh     dictItem solution;
215*0c16b537SWarner Losh 
216*0c16b537SWarner Losh     /* init */
217*0c16b537SWarner Losh     memset(&solution, 0, sizeof(solution));
218*0c16b537SWarner Losh     doneMarks[pos] = 1;
219*0c16b537SWarner Losh 
220*0c16b537SWarner Losh     /* trivial repetition cases */
221*0c16b537SWarner Losh     if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
222*0c16b537SWarner Losh        ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
223*0c16b537SWarner Losh        ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
224*0c16b537SWarner Losh         /* skip and mark segment */
225*0c16b537SWarner Losh         U16 u16 = MEM_read16(b+pos+4);
226*0c16b537SWarner Losh         U32 u, e = 6;
227*0c16b537SWarner Losh         while (MEM_read16(b+pos+e) == u16) e+=2 ;
228*0c16b537SWarner Losh         if (b[pos+e] == b[pos+e-1]) e++;
229*0c16b537SWarner Losh         for (u=1; u<e; u++)
230*0c16b537SWarner Losh             doneMarks[pos+u] = 1;
231*0c16b537SWarner Losh         return solution;
232*0c16b537SWarner Losh     }
233*0c16b537SWarner Losh 
234*0c16b537SWarner Losh     /* look forward */
235*0c16b537SWarner Losh     do {
236*0c16b537SWarner Losh         end++;
237*0c16b537SWarner Losh         length = ZDICT_count(b + pos, b + suffix[end]);
238*0c16b537SWarner Losh     } while (length >=MINMATCHLENGTH);
239*0c16b537SWarner Losh 
240*0c16b537SWarner Losh     /* look backward */
241*0c16b537SWarner Losh     do {
242*0c16b537SWarner Losh         length = ZDICT_count(b + pos, b + *(suffix+start-1));
243*0c16b537SWarner Losh         if (length >=MINMATCHLENGTH) start--;
244*0c16b537SWarner Losh     } while(length >= MINMATCHLENGTH);
245*0c16b537SWarner Losh 
246*0c16b537SWarner Losh     /* exit if not found a minimum nb of repetitions */
247*0c16b537SWarner Losh     if (end-start < minRatio) {
248*0c16b537SWarner Losh         U32 idx;
249*0c16b537SWarner Losh         for(idx=start; idx<end; idx++)
250*0c16b537SWarner Losh             doneMarks[suffix[idx]] = 1;
251*0c16b537SWarner Losh         return solution;
252*0c16b537SWarner Losh     }
253*0c16b537SWarner Losh 
254*0c16b537SWarner Losh     {   int i;
255*0c16b537SWarner Losh         U32 searchLength;
256*0c16b537SWarner Losh         U32 refinedStart = start;
257*0c16b537SWarner Losh         U32 refinedEnd = end;
258*0c16b537SWarner Losh 
259*0c16b537SWarner Losh         DISPLAYLEVEL(4, "\n");
260*0c16b537SWarner Losh         DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u  ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
261*0c16b537SWarner Losh         DISPLAYLEVEL(4, "\n");
262*0c16b537SWarner Losh 
263*0c16b537SWarner Losh         for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
264*0c16b537SWarner Losh             BYTE currentChar = 0;
265*0c16b537SWarner Losh             U32 currentCount = 0;
266*0c16b537SWarner Losh             U32 currentID = refinedStart;
267*0c16b537SWarner Losh             U32 id;
268*0c16b537SWarner Losh             U32 selectedCount = 0;
269*0c16b537SWarner Losh             U32 selectedID = currentID;
270*0c16b537SWarner Losh             for (id =refinedStart; id < refinedEnd; id++) {
271*0c16b537SWarner Losh                 if (b[ suffix[id] + searchLength] != currentChar) {
272*0c16b537SWarner Losh                     if (currentCount > selectedCount) {
273*0c16b537SWarner Losh                         selectedCount = currentCount;
274*0c16b537SWarner Losh                         selectedID = currentID;
275*0c16b537SWarner Losh                     }
276*0c16b537SWarner Losh                     currentID = id;
277*0c16b537SWarner Losh                     currentChar = b[ suffix[id] + searchLength];
278*0c16b537SWarner Losh                     currentCount = 0;
279*0c16b537SWarner Losh                 }
280*0c16b537SWarner Losh                 currentCount ++;
281*0c16b537SWarner Losh             }
282*0c16b537SWarner Losh             if (currentCount > selectedCount) {  /* for last */
283*0c16b537SWarner Losh                 selectedCount = currentCount;
284*0c16b537SWarner Losh                 selectedID = currentID;
285*0c16b537SWarner Losh             }
286*0c16b537SWarner Losh 
287*0c16b537SWarner Losh             if (selectedCount < minRatio)
288*0c16b537SWarner Losh                 break;
289*0c16b537SWarner Losh             refinedStart = selectedID;
290*0c16b537SWarner Losh             refinedEnd = refinedStart + selectedCount;
291*0c16b537SWarner Losh         }
292*0c16b537SWarner Losh 
293*0c16b537SWarner Losh         /* evaluate gain based on new ref */
294*0c16b537SWarner Losh         start = refinedStart;
295*0c16b537SWarner Losh         pos = suffix[refinedStart];
296*0c16b537SWarner Losh         end = start;
297*0c16b537SWarner Losh         memset(lengthList, 0, sizeof(lengthList));
298*0c16b537SWarner Losh 
299*0c16b537SWarner Losh         /* look forward */
300*0c16b537SWarner Losh         do {
301*0c16b537SWarner Losh             end++;
302*0c16b537SWarner Losh             length = ZDICT_count(b + pos, b + suffix[end]);
303*0c16b537SWarner Losh             if (length >= LLIMIT) length = LLIMIT-1;
304*0c16b537SWarner Losh             lengthList[length]++;
305*0c16b537SWarner Losh         } while (length >=MINMATCHLENGTH);
306*0c16b537SWarner Losh 
307*0c16b537SWarner Losh         /* look backward */
308*0c16b537SWarner Losh         length = MINMATCHLENGTH;
309*0c16b537SWarner Losh         while ((length >= MINMATCHLENGTH) & (start > 0)) {
310*0c16b537SWarner Losh             length = ZDICT_count(b + pos, b + suffix[start - 1]);
311*0c16b537SWarner Losh             if (length >= LLIMIT) length = LLIMIT - 1;
312*0c16b537SWarner Losh             lengthList[length]++;
313*0c16b537SWarner Losh             if (length >= MINMATCHLENGTH) start--;
314*0c16b537SWarner Losh         }
315*0c16b537SWarner Losh 
316*0c16b537SWarner Losh         /* largest useful length */
317*0c16b537SWarner Losh         memset(cumulLength, 0, sizeof(cumulLength));
318*0c16b537SWarner Losh         cumulLength[maxLength-1] = lengthList[maxLength-1];
319*0c16b537SWarner Losh         for (i=(int)(maxLength-2); i>=0; i--)
320*0c16b537SWarner Losh             cumulLength[i] = cumulLength[i+1] + lengthList[i];
321*0c16b537SWarner Losh 
322*0c16b537SWarner Losh         for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
323*0c16b537SWarner Losh         maxLength = i;
324*0c16b537SWarner Losh 
325*0c16b537SWarner Losh         /* reduce maxLength in case of final into repetitive data */
326*0c16b537SWarner Losh         {   U32 l = (U32)maxLength;
327*0c16b537SWarner Losh             BYTE const c = b[pos + maxLength-1];
328*0c16b537SWarner Losh             while (b[pos+l-2]==c) l--;
329*0c16b537SWarner Losh             maxLength = l;
330*0c16b537SWarner Losh         }
331*0c16b537SWarner Losh         if (maxLength < MINMATCHLENGTH) return solution;   /* skip : no long-enough solution */
332*0c16b537SWarner Losh 
333*0c16b537SWarner Losh         /* calculate savings */
334*0c16b537SWarner Losh         savings[5] = 0;
335*0c16b537SWarner Losh         for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
336*0c16b537SWarner Losh             savings[i] = savings[i-1] + (lengthList[i] * (i-3));
337*0c16b537SWarner Losh 
338*0c16b537SWarner Losh         DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f)  \n",
339*0c16b537SWarner Losh                      (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
340*0c16b537SWarner Losh 
341*0c16b537SWarner Losh         solution.pos = (U32)pos;
342*0c16b537SWarner Losh         solution.length = (U32)maxLength;
343*0c16b537SWarner Losh         solution.savings = savings[maxLength];
344*0c16b537SWarner Losh 
345*0c16b537SWarner Losh         /* mark positions done */
346*0c16b537SWarner Losh         {   U32 id;
347*0c16b537SWarner Losh             for (id=start; id<end; id++) {
348*0c16b537SWarner Losh                 U32 p, pEnd;
349*0c16b537SWarner Losh                 U32 const testedPos = suffix[id];
350*0c16b537SWarner Losh                 if (testedPos == pos)
351*0c16b537SWarner Losh                     length = solution.length;
352*0c16b537SWarner Losh                 else {
353*0c16b537SWarner Losh                     length = ZDICT_count(b+pos, b+testedPos);
354*0c16b537SWarner Losh                     if (length > solution.length) length = solution.length;
355*0c16b537SWarner Losh                 }
356*0c16b537SWarner Losh                 pEnd = (U32)(testedPos + length);
357*0c16b537SWarner Losh                 for (p=testedPos; p<pEnd; p++)
358*0c16b537SWarner Losh                     doneMarks[p] = 1;
359*0c16b537SWarner Losh     }   }   }
360*0c16b537SWarner Losh 
361*0c16b537SWarner Losh     return solution;
362*0c16b537SWarner Losh }
363*0c16b537SWarner Losh 
364*0c16b537SWarner Losh 
365*0c16b537SWarner Losh static int isIncluded(const void* in, const void* container, size_t length)
366*0c16b537SWarner Losh {
367*0c16b537SWarner Losh     const char* const ip = (const char*) in;
368*0c16b537SWarner Losh     const char* const into = (const char*) container;
369*0c16b537SWarner Losh     size_t u;
370*0c16b537SWarner Losh 
371*0c16b537SWarner Losh     for (u=0; u<length; u++) {  /* works because end of buffer is a noisy guard band */
372*0c16b537SWarner Losh         if (ip[u] != into[u]) break;
373*0c16b537SWarner Losh     }
374*0c16b537SWarner Losh 
375*0c16b537SWarner Losh     return u==length;
376*0c16b537SWarner Losh }
377*0c16b537SWarner Losh 
378*0c16b537SWarner Losh /*! ZDICT_tryMerge() :
379*0c16b537SWarner Losh     check if dictItem can be merged, do it if possible
380*0c16b537SWarner Losh     @return : id of destination elt, 0 if not merged
381*0c16b537SWarner Losh */
382*0c16b537SWarner Losh static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
383*0c16b537SWarner Losh {
384*0c16b537SWarner Losh     const U32 tableSize = table->pos;
385*0c16b537SWarner Losh     const U32 eltEnd = elt.pos + elt.length;
386*0c16b537SWarner Losh     const char* const buf = (const char*) buffer;
387*0c16b537SWarner Losh 
388*0c16b537SWarner Losh     /* tail overlap */
389*0c16b537SWarner Losh     U32 u; for (u=1; u<tableSize; u++) {
390*0c16b537SWarner Losh         if (u==eltNbToSkip) continue;
391*0c16b537SWarner Losh         if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) {  /* overlap, existing > new */
392*0c16b537SWarner Losh             /* append */
393*0c16b537SWarner Losh             U32 const addedLength = table[u].pos - elt.pos;
394*0c16b537SWarner Losh             table[u].length += addedLength;
395*0c16b537SWarner Losh             table[u].pos = elt.pos;
396*0c16b537SWarner Losh             table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
397*0c16b537SWarner Losh             table[u].savings += elt.length / 8;    /* rough approx bonus */
398*0c16b537SWarner Losh             elt = table[u];
399*0c16b537SWarner Losh             /* sort : improve rank */
400*0c16b537SWarner Losh             while ((u>1) && (table[u-1].savings < elt.savings))
401*0c16b537SWarner Losh             table[u] = table[u-1], u--;
402*0c16b537SWarner Losh             table[u] = elt;
403*0c16b537SWarner Losh             return u;
404*0c16b537SWarner Losh     }   }
405*0c16b537SWarner Losh 
406*0c16b537SWarner Losh     /* front overlap */
407*0c16b537SWarner Losh     for (u=1; u<tableSize; u++) {
408*0c16b537SWarner Losh         if (u==eltNbToSkip) continue;
409*0c16b537SWarner Losh 
410*0c16b537SWarner Losh         if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) {  /* overlap, existing < new */
411*0c16b537SWarner Losh             /* append */
412*0c16b537SWarner Losh             int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
413*0c16b537SWarner Losh             table[u].savings += elt.length / 8;    /* rough approx bonus */
414*0c16b537SWarner Losh             if (addedLength > 0) {   /* otherwise, elt fully included into existing */
415*0c16b537SWarner Losh                 table[u].length += addedLength;
416*0c16b537SWarner Losh                 table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
417*0c16b537SWarner Losh             }
418*0c16b537SWarner Losh             /* sort : improve rank */
419*0c16b537SWarner Losh             elt = table[u];
420*0c16b537SWarner Losh             while ((u>1) && (table[u-1].savings < elt.savings))
421*0c16b537SWarner Losh                 table[u] = table[u-1], u--;
422*0c16b537SWarner Losh             table[u] = elt;
423*0c16b537SWarner Losh             return u;
424*0c16b537SWarner Losh         }
425*0c16b537SWarner Losh 
426*0c16b537SWarner Losh         if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
427*0c16b537SWarner Losh             if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
428*0c16b537SWarner Losh                 size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
429*0c16b537SWarner Losh                 table[u].pos = elt.pos;
430*0c16b537SWarner Losh                 table[u].savings += (U32)(elt.savings * addedLength / elt.length);
431*0c16b537SWarner Losh                 table[u].length = MIN(elt.length, table[u].length + 1);
432*0c16b537SWarner Losh                 return u;
433*0c16b537SWarner Losh             }
434*0c16b537SWarner Losh         }
435*0c16b537SWarner Losh     }
436*0c16b537SWarner Losh 
437*0c16b537SWarner Losh     return 0;
438*0c16b537SWarner Losh }
439*0c16b537SWarner Losh 
440*0c16b537SWarner Losh 
441*0c16b537SWarner Losh static void ZDICT_removeDictItem(dictItem* table, U32 id)
442*0c16b537SWarner Losh {
443*0c16b537SWarner Losh     /* convention : table[0].pos stores nb of elts */
444*0c16b537SWarner Losh     U32 const max = table[0].pos;
445*0c16b537SWarner Losh     U32 u;
446*0c16b537SWarner Losh     if (!id) return;   /* protection, should never happen */
447*0c16b537SWarner Losh     for (u=id; u<max-1; u++)
448*0c16b537SWarner Losh         table[u] = table[u+1];
449*0c16b537SWarner Losh     table->pos--;
450*0c16b537SWarner Losh }
451*0c16b537SWarner Losh 
452*0c16b537SWarner Losh 
453*0c16b537SWarner Losh static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
454*0c16b537SWarner Losh {
455*0c16b537SWarner Losh     /* merge if possible */
456*0c16b537SWarner Losh     U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
457*0c16b537SWarner Losh     if (mergeId) {
458*0c16b537SWarner Losh         U32 newMerge = 1;
459*0c16b537SWarner Losh         while (newMerge) {
460*0c16b537SWarner Losh             newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
461*0c16b537SWarner Losh             if (newMerge) ZDICT_removeDictItem(table, mergeId);
462*0c16b537SWarner Losh             mergeId = newMerge;
463*0c16b537SWarner Losh         }
464*0c16b537SWarner Losh         return;
465*0c16b537SWarner Losh     }
466*0c16b537SWarner Losh 
467*0c16b537SWarner Losh     /* insert */
468*0c16b537SWarner Losh     {   U32 current;
469*0c16b537SWarner Losh         U32 nextElt = table->pos;
470*0c16b537SWarner Losh         if (nextElt >= maxSize) nextElt = maxSize-1;
471*0c16b537SWarner Losh         current = nextElt-1;
472*0c16b537SWarner Losh         while (table[current].savings < elt.savings) {
473*0c16b537SWarner Losh             table[current+1] = table[current];
474*0c16b537SWarner Losh             current--;
475*0c16b537SWarner Losh         }
476*0c16b537SWarner Losh         table[current+1] = elt;
477*0c16b537SWarner Losh         table->pos = nextElt+1;
478*0c16b537SWarner Losh     }
479*0c16b537SWarner Losh }
480*0c16b537SWarner Losh 
481*0c16b537SWarner Losh 
482*0c16b537SWarner Losh static U32 ZDICT_dictSize(const dictItem* dictList)
483*0c16b537SWarner Losh {
484*0c16b537SWarner Losh     U32 u, dictSize = 0;
485*0c16b537SWarner Losh     for (u=1; u<dictList[0].pos; u++)
486*0c16b537SWarner Losh         dictSize += dictList[u].length;
487*0c16b537SWarner Losh     return dictSize;
488*0c16b537SWarner Losh }
489*0c16b537SWarner Losh 
490*0c16b537SWarner Losh 
491*0c16b537SWarner Losh static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
492*0c16b537SWarner Losh                             const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
493*0c16b537SWarner Losh                             const size_t* fileSizes, unsigned nbFiles,
494*0c16b537SWarner Losh                             U32 minRatio, U32 notificationLevel)
495*0c16b537SWarner Losh {
496*0c16b537SWarner Losh     int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
497*0c16b537SWarner Losh     int* const suffix = suffix0+1;
498*0c16b537SWarner Losh     U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
499*0c16b537SWarner Losh     BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks));   /* +16 for overflow security */
500*0c16b537SWarner Losh     U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
501*0c16b537SWarner Losh     size_t result = 0;
502*0c16b537SWarner Losh     clock_t displayClock = 0;
503*0c16b537SWarner Losh     clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
504*0c16b537SWarner Losh 
505*0c16b537SWarner Losh #   define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
506*0c16b537SWarner Losh             if (ZDICT_clockSpan(displayClock) > refreshRate)  \
507*0c16b537SWarner Losh             { displayClock = clock(); DISPLAY(__VA_ARGS__); \
508*0c16b537SWarner Losh             if (notificationLevel>=4) fflush(stderr); } }
509*0c16b537SWarner Losh 
510*0c16b537SWarner Losh     /* init */
511*0c16b537SWarner Losh     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
512*0c16b537SWarner Losh     if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
513*0c16b537SWarner Losh         result = ERROR(memory_allocation);
514*0c16b537SWarner Losh         goto _cleanup;
515*0c16b537SWarner Losh     }
516*0c16b537SWarner Losh     if (minRatio < MINRATIO) minRatio = MINRATIO;
517*0c16b537SWarner Losh     memset(doneMarks, 0, bufferSize+16);
518*0c16b537SWarner Losh 
519*0c16b537SWarner Losh     /* limit sample set size (divsufsort limitation)*/
520*0c16b537SWarner Losh     if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
521*0c16b537SWarner Losh     while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
522*0c16b537SWarner Losh 
523*0c16b537SWarner Losh     /* sort */
524*0c16b537SWarner Losh     DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
525*0c16b537SWarner Losh     {   int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
526*0c16b537SWarner Losh         if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
527*0c16b537SWarner Losh     }
528*0c16b537SWarner Losh     suffix[bufferSize] = (int)bufferSize;   /* leads into noise */
529*0c16b537SWarner Losh     suffix0[0] = (int)bufferSize;           /* leads into noise */
530*0c16b537SWarner Losh     /* build reverse suffix sort */
531*0c16b537SWarner Losh     {   size_t pos;
532*0c16b537SWarner Losh         for (pos=0; pos < bufferSize; pos++)
533*0c16b537SWarner Losh             reverseSuffix[suffix[pos]] = (U32)pos;
534*0c16b537SWarner Losh         /* note filePos tracks borders between samples.
535*0c16b537SWarner Losh            It's not used at this stage, but planned to become useful in a later update */
536*0c16b537SWarner Losh         filePos[0] = 0;
537*0c16b537SWarner Losh         for (pos=1; pos<nbFiles; pos++)
538*0c16b537SWarner Losh             filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
539*0c16b537SWarner Losh     }
540*0c16b537SWarner Losh 
541*0c16b537SWarner Losh     DISPLAYLEVEL(2, "finding patterns ... \n");
542*0c16b537SWarner Losh     DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
543*0c16b537SWarner Losh 
544*0c16b537SWarner Losh     {   U32 cursor; for (cursor=0; cursor < bufferSize; ) {
545*0c16b537SWarner Losh             dictItem solution;
546*0c16b537SWarner Losh             if (doneMarks[cursor]) { cursor++; continue; }
547*0c16b537SWarner Losh             solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
548*0c16b537SWarner Losh             if (solution.length==0) { cursor++; continue; }
549*0c16b537SWarner Losh             ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
550*0c16b537SWarner Losh             cursor += solution.length;
551*0c16b537SWarner Losh             DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
552*0c16b537SWarner Losh     }   }
553*0c16b537SWarner Losh 
554*0c16b537SWarner Losh _cleanup:
555*0c16b537SWarner Losh     free(suffix0);
556*0c16b537SWarner Losh     free(reverseSuffix);
557*0c16b537SWarner Losh     free(doneMarks);
558*0c16b537SWarner Losh     free(filePos);
559*0c16b537SWarner Losh     return result;
560*0c16b537SWarner Losh }
561*0c16b537SWarner Losh 
562*0c16b537SWarner Losh 
563*0c16b537SWarner Losh static void ZDICT_fillNoise(void* buffer, size_t length)
564*0c16b537SWarner Losh {
565*0c16b537SWarner Losh     unsigned const prime1 = 2654435761U;
566*0c16b537SWarner Losh     unsigned const prime2 = 2246822519U;
567*0c16b537SWarner Losh     unsigned acc = prime1;
568*0c16b537SWarner Losh     size_t p=0;;
569*0c16b537SWarner Losh     for (p=0; p<length; p++) {
570*0c16b537SWarner Losh         acc *= prime2;
571*0c16b537SWarner Losh         ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
572*0c16b537SWarner Losh     }
573*0c16b537SWarner Losh }
574*0c16b537SWarner Losh 
575*0c16b537SWarner Losh 
576*0c16b537SWarner Losh typedef struct
577*0c16b537SWarner Losh {
578*0c16b537SWarner Losh     ZSTD_CCtx* ref;
579*0c16b537SWarner Losh     ZSTD_CCtx* zc;
580*0c16b537SWarner Losh     void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */
581*0c16b537SWarner Losh } EStats_ress_t;
582*0c16b537SWarner Losh 
583*0c16b537SWarner Losh #define MAXREPOFFSET 1024
584*0c16b537SWarner Losh 
585*0c16b537SWarner Losh static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
586*0c16b537SWarner Losh                             U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
587*0c16b537SWarner Losh                             const void* src, size_t srcSize, U32 notificationLevel)
588*0c16b537SWarner Losh {
589*0c16b537SWarner Losh     size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
590*0c16b537SWarner Losh     size_t cSize;
591*0c16b537SWarner Losh 
592*0c16b537SWarner Losh     if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
593*0c16b537SWarner Losh     {  size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
594*0c16b537SWarner Losh             if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
595*0c16b537SWarner Losh     }
596*0c16b537SWarner Losh     cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
597*0c16b537SWarner Losh     if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
598*0c16b537SWarner Losh 
599*0c16b537SWarner Losh     if (cSize) {  /* if == 0; block is not compressible */
600*0c16b537SWarner Losh         const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
601*0c16b537SWarner Losh 
602*0c16b537SWarner Losh         /* literals stats */
603*0c16b537SWarner Losh         {   const BYTE* bytePtr;
604*0c16b537SWarner Losh             for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
605*0c16b537SWarner Losh                 countLit[*bytePtr]++;
606*0c16b537SWarner Losh         }
607*0c16b537SWarner Losh 
608*0c16b537SWarner Losh         /* seqStats */
609*0c16b537SWarner Losh         {   U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
610*0c16b537SWarner Losh             ZSTD_seqToCodes(seqStorePtr);
611*0c16b537SWarner Losh 
612*0c16b537SWarner Losh             {   const BYTE* codePtr = seqStorePtr->ofCode;
613*0c16b537SWarner Losh                 U32 u;
614*0c16b537SWarner Losh                 for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
615*0c16b537SWarner Losh             }
616*0c16b537SWarner Losh 
617*0c16b537SWarner Losh             {   const BYTE* codePtr = seqStorePtr->mlCode;
618*0c16b537SWarner Losh                 U32 u;
619*0c16b537SWarner Losh                 for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
620*0c16b537SWarner Losh             }
621*0c16b537SWarner Losh 
622*0c16b537SWarner Losh             {   const BYTE* codePtr = seqStorePtr->llCode;
623*0c16b537SWarner Losh                 U32 u;
624*0c16b537SWarner Losh                 for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
625*0c16b537SWarner Losh             }
626*0c16b537SWarner Losh 
627*0c16b537SWarner Losh             if (nbSeq >= 2) { /* rep offsets */
628*0c16b537SWarner Losh                 const seqDef* const seq = seqStorePtr->sequencesStart;
629*0c16b537SWarner Losh                 U32 offset1 = seq[0].offset - 3;
630*0c16b537SWarner Losh                 U32 offset2 = seq[1].offset - 3;
631*0c16b537SWarner Losh                 if (offset1 >= MAXREPOFFSET) offset1 = 0;
632*0c16b537SWarner Losh                 if (offset2 >= MAXREPOFFSET) offset2 = 0;
633*0c16b537SWarner Losh                 repOffsets[offset1] += 3;
634*0c16b537SWarner Losh                 repOffsets[offset2] += 1;
635*0c16b537SWarner Losh     }   }   }
636*0c16b537SWarner Losh }
637*0c16b537SWarner Losh 
638*0c16b537SWarner Losh static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
639*0c16b537SWarner Losh {
640*0c16b537SWarner Losh     size_t total=0;
641*0c16b537SWarner Losh     unsigned u;
642*0c16b537SWarner Losh     for (u=0; u<nbFiles; u++) total += fileSizes[u];
643*0c16b537SWarner Losh     return total;
644*0c16b537SWarner Losh }
645*0c16b537SWarner Losh 
646*0c16b537SWarner Losh typedef struct { U32 offset; U32 count; } offsetCount_t;
647*0c16b537SWarner Losh 
648*0c16b537SWarner Losh static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
649*0c16b537SWarner Losh {
650*0c16b537SWarner Losh     U32 u;
651*0c16b537SWarner Losh     table[ZSTD_REP_NUM].offset = val;
652*0c16b537SWarner Losh     table[ZSTD_REP_NUM].count = count;
653*0c16b537SWarner Losh     for (u=ZSTD_REP_NUM; u>0; u--) {
654*0c16b537SWarner Losh         offsetCount_t tmp;
655*0c16b537SWarner Losh         if (table[u-1].count >= table[u].count) break;
656*0c16b537SWarner Losh         tmp = table[u-1];
657*0c16b537SWarner Losh         table[u-1] = table[u];
658*0c16b537SWarner Losh         table[u] = tmp;
659*0c16b537SWarner Losh     }
660*0c16b537SWarner Losh }
661*0c16b537SWarner Losh 
662*0c16b537SWarner Losh 
663*0c16b537SWarner Losh #define OFFCODE_MAX 30  /* only applicable to first block */
664*0c16b537SWarner Losh static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
665*0c16b537SWarner Losh                                    unsigned compressionLevel,
666*0c16b537SWarner Losh                              const void*  srcBuffer, const size_t* fileSizes, unsigned nbFiles,
667*0c16b537SWarner Losh                              const void* dictBuffer, size_t  dictBufferSize,
668*0c16b537SWarner Losh                                    unsigned notificationLevel)
669*0c16b537SWarner Losh {
670*0c16b537SWarner Losh     U32 countLit[256];
671*0c16b537SWarner Losh     HUF_CREATE_STATIC_CTABLE(hufTable, 255);
672*0c16b537SWarner Losh     U32 offcodeCount[OFFCODE_MAX+1];
673*0c16b537SWarner Losh     short offcodeNCount[OFFCODE_MAX+1];
674*0c16b537SWarner Losh     U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
675*0c16b537SWarner Losh     U32 matchLengthCount[MaxML+1];
676*0c16b537SWarner Losh     short matchLengthNCount[MaxML+1];
677*0c16b537SWarner Losh     U32 litLengthCount[MaxLL+1];
678*0c16b537SWarner Losh     short litLengthNCount[MaxLL+1];
679*0c16b537SWarner Losh     U32 repOffset[MAXREPOFFSET];
680*0c16b537SWarner Losh     offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
681*0c16b537SWarner Losh     EStats_ress_t esr;
682*0c16b537SWarner Losh     ZSTD_parameters params;
683*0c16b537SWarner Losh     U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
684*0c16b537SWarner Losh     size_t pos = 0, errorCode;
685*0c16b537SWarner Losh     size_t eSize = 0;
686*0c16b537SWarner Losh     size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
687*0c16b537SWarner Losh     size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
688*0c16b537SWarner Losh     BYTE* dstPtr = (BYTE*)dstBuffer;
689*0c16b537SWarner Losh 
690*0c16b537SWarner Losh     /* init */
691*0c16b537SWarner Losh     esr.ref = ZSTD_createCCtx();
692*0c16b537SWarner Losh     esr.zc = ZSTD_createCCtx();
693*0c16b537SWarner Losh     esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
694*0c16b537SWarner Losh     if (!esr.ref || !esr.zc || !esr.workPlace) {
695*0c16b537SWarner Losh         eSize = ERROR(memory_allocation);
696*0c16b537SWarner Losh         DISPLAYLEVEL(1, "Not enough memory \n");
697*0c16b537SWarner Losh         goto _cleanup;
698*0c16b537SWarner Losh     }
699*0c16b537SWarner Losh     if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
700*0c16b537SWarner Losh     for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
701*0c16b537SWarner Losh     for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
702*0c16b537SWarner Losh     for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
703*0c16b537SWarner Losh     for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
704*0c16b537SWarner Losh     memset(repOffset, 0, sizeof(repOffset));
705*0c16b537SWarner Losh     repOffset[1] = repOffset[4] = repOffset[8] = 1;
706*0c16b537SWarner Losh     memset(bestRepOffset, 0, sizeof(bestRepOffset));
707*0c16b537SWarner Losh     if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
708*0c16b537SWarner Losh     params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
709*0c16b537SWarner Losh     {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
710*0c16b537SWarner Losh         if (ZSTD_isError(beginResult)) {
711*0c16b537SWarner Losh             DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
712*0c16b537SWarner Losh             eSize = ERROR(GENERIC);
713*0c16b537SWarner Losh             goto _cleanup;
714*0c16b537SWarner Losh     }   }
715*0c16b537SWarner Losh 
716*0c16b537SWarner Losh     /* collect stats on all files */
717*0c16b537SWarner Losh     for (u=0; u<nbFiles; u++) {
718*0c16b537SWarner Losh         ZDICT_countEStats(esr, params,
719*0c16b537SWarner Losh                           countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
720*0c16b537SWarner Losh                          (const char*)srcBuffer + pos, fileSizes[u],
721*0c16b537SWarner Losh                           notificationLevel);
722*0c16b537SWarner Losh         pos += fileSizes[u];
723*0c16b537SWarner Losh     }
724*0c16b537SWarner Losh 
725*0c16b537SWarner Losh     /* analyze */
726*0c16b537SWarner Losh     errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
727*0c16b537SWarner Losh     if (HUF_isError(errorCode)) {
728*0c16b537SWarner Losh         eSize = ERROR(GENERIC);
729*0c16b537SWarner Losh         DISPLAYLEVEL(1, "HUF_buildCTable error \n");
730*0c16b537SWarner Losh         goto _cleanup;
731*0c16b537SWarner Losh     }
732*0c16b537SWarner Losh     huffLog = (U32)errorCode;
733*0c16b537SWarner Losh 
734*0c16b537SWarner Losh     /* looking for most common first offsets */
735*0c16b537SWarner Losh     {   U32 offset;
736*0c16b537SWarner Losh         for (offset=1; offset<MAXREPOFFSET; offset++)
737*0c16b537SWarner Losh             ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
738*0c16b537SWarner Losh     }
739*0c16b537SWarner Losh     /* note : the result of this phase should be used to better appreciate the impact on statistics */
740*0c16b537SWarner Losh 
741*0c16b537SWarner Losh     total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
742*0c16b537SWarner Losh     errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
743*0c16b537SWarner Losh     if (FSE_isError(errorCode)) {
744*0c16b537SWarner Losh         eSize = ERROR(GENERIC);
745*0c16b537SWarner Losh         DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
746*0c16b537SWarner Losh         goto _cleanup;
747*0c16b537SWarner Losh     }
748*0c16b537SWarner Losh     Offlog = (U32)errorCode;
749*0c16b537SWarner Losh 
750*0c16b537SWarner Losh     total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
751*0c16b537SWarner Losh     errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
752*0c16b537SWarner Losh     if (FSE_isError(errorCode)) {
753*0c16b537SWarner Losh         eSize = ERROR(GENERIC);
754*0c16b537SWarner Losh         DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
755*0c16b537SWarner Losh         goto _cleanup;
756*0c16b537SWarner Losh     }
757*0c16b537SWarner Losh     mlLog = (U32)errorCode;
758*0c16b537SWarner Losh 
759*0c16b537SWarner Losh     total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
760*0c16b537SWarner Losh     errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
761*0c16b537SWarner Losh     if (FSE_isError(errorCode)) {
762*0c16b537SWarner Losh         eSize = ERROR(GENERIC);
763*0c16b537SWarner Losh         DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
764*0c16b537SWarner Losh         goto _cleanup;
765*0c16b537SWarner Losh     }
766*0c16b537SWarner Losh     llLog = (U32)errorCode;
767*0c16b537SWarner Losh 
768*0c16b537SWarner Losh     /* write result to buffer */
769*0c16b537SWarner Losh     {   size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
770*0c16b537SWarner Losh         if (HUF_isError(hhSize)) {
771*0c16b537SWarner Losh             eSize = ERROR(GENERIC);
772*0c16b537SWarner Losh             DISPLAYLEVEL(1, "HUF_writeCTable error \n");
773*0c16b537SWarner Losh             goto _cleanup;
774*0c16b537SWarner Losh         }
775*0c16b537SWarner Losh         dstPtr += hhSize;
776*0c16b537SWarner Losh         maxDstSize -= hhSize;
777*0c16b537SWarner Losh         eSize += hhSize;
778*0c16b537SWarner Losh     }
779*0c16b537SWarner Losh 
780*0c16b537SWarner Losh     {   size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
781*0c16b537SWarner Losh         if (FSE_isError(ohSize)) {
782*0c16b537SWarner Losh             eSize = ERROR(GENERIC);
783*0c16b537SWarner Losh             DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
784*0c16b537SWarner Losh             goto _cleanup;
785*0c16b537SWarner Losh         }
786*0c16b537SWarner Losh         dstPtr += ohSize;
787*0c16b537SWarner Losh         maxDstSize -= ohSize;
788*0c16b537SWarner Losh         eSize += ohSize;
789*0c16b537SWarner Losh     }
790*0c16b537SWarner Losh 
791*0c16b537SWarner Losh     {   size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
792*0c16b537SWarner Losh         if (FSE_isError(mhSize)) {
793*0c16b537SWarner Losh             eSize = ERROR(GENERIC);
794*0c16b537SWarner Losh             DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
795*0c16b537SWarner Losh             goto _cleanup;
796*0c16b537SWarner Losh         }
797*0c16b537SWarner Losh         dstPtr += mhSize;
798*0c16b537SWarner Losh         maxDstSize -= mhSize;
799*0c16b537SWarner Losh         eSize += mhSize;
800*0c16b537SWarner Losh     }
801*0c16b537SWarner Losh 
802*0c16b537SWarner Losh     {   size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
803*0c16b537SWarner Losh         if (FSE_isError(lhSize)) {
804*0c16b537SWarner Losh             eSize = ERROR(GENERIC);
805*0c16b537SWarner Losh             DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
806*0c16b537SWarner Losh             goto _cleanup;
807*0c16b537SWarner Losh         }
808*0c16b537SWarner Losh         dstPtr += lhSize;
809*0c16b537SWarner Losh         maxDstSize -= lhSize;
810*0c16b537SWarner Losh         eSize += lhSize;
811*0c16b537SWarner Losh     }
812*0c16b537SWarner Losh 
813*0c16b537SWarner Losh     if (maxDstSize<12) {
814*0c16b537SWarner Losh         eSize = ERROR(GENERIC);
815*0c16b537SWarner Losh         DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
816*0c16b537SWarner Losh         goto _cleanup;
817*0c16b537SWarner Losh     }
818*0c16b537SWarner Losh # if 0
819*0c16b537SWarner Losh     MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
820*0c16b537SWarner Losh     MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
821*0c16b537SWarner Losh     MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
822*0c16b537SWarner Losh #else
823*0c16b537SWarner Losh     /* at this stage, we don't use the result of "most common first offset",
824*0c16b537SWarner Losh        as the impact of statistics is not properly evaluated */
825*0c16b537SWarner Losh     MEM_writeLE32(dstPtr+0, repStartValue[0]);
826*0c16b537SWarner Losh     MEM_writeLE32(dstPtr+4, repStartValue[1]);
827*0c16b537SWarner Losh     MEM_writeLE32(dstPtr+8, repStartValue[2]);
828*0c16b537SWarner Losh #endif
829*0c16b537SWarner Losh     eSize += 12;
830*0c16b537SWarner Losh 
831*0c16b537SWarner Losh _cleanup:
832*0c16b537SWarner Losh     ZSTD_freeCCtx(esr.ref);
833*0c16b537SWarner Losh     ZSTD_freeCCtx(esr.zc);
834*0c16b537SWarner Losh     free(esr.workPlace);
835*0c16b537SWarner Losh 
836*0c16b537SWarner Losh     return eSize;
837*0c16b537SWarner Losh }
838*0c16b537SWarner Losh 
839*0c16b537SWarner Losh 
840*0c16b537SWarner Losh 
841*0c16b537SWarner Losh size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
842*0c16b537SWarner Losh                           const void* customDictContent, size_t dictContentSize,
843*0c16b537SWarner Losh                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
844*0c16b537SWarner Losh                           ZDICT_params_t params)
845*0c16b537SWarner Losh {
846*0c16b537SWarner Losh     size_t hSize;
847*0c16b537SWarner Losh #define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
848*0c16b537SWarner Losh     BYTE header[HBUFFSIZE];
849*0c16b537SWarner Losh     int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
850*0c16b537SWarner Losh     U32 const notificationLevel = params.notificationLevel;
851*0c16b537SWarner Losh 
852*0c16b537SWarner Losh     /* check conditions */
853*0c16b537SWarner Losh     if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
854*0c16b537SWarner Losh     if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
855*0c16b537SWarner Losh     if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
856*0c16b537SWarner Losh 
857*0c16b537SWarner Losh     /* dictionary header */
858*0c16b537SWarner Losh     MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
859*0c16b537SWarner Losh     {   U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
860*0c16b537SWarner Losh         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
861*0c16b537SWarner Losh         U32 const dictID = params.dictID ? params.dictID : compliantID;
862*0c16b537SWarner Losh         MEM_writeLE32(header+4, dictID);
863*0c16b537SWarner Losh     }
864*0c16b537SWarner Losh     hSize = 8;
865*0c16b537SWarner Losh 
866*0c16b537SWarner Losh     /* entropy tables */
867*0c16b537SWarner Losh     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
868*0c16b537SWarner Losh     DISPLAYLEVEL(2, "statistics ... \n");
869*0c16b537SWarner Losh     {   size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
870*0c16b537SWarner Losh                                   compressionLevel,
871*0c16b537SWarner Losh                                   samplesBuffer, samplesSizes, nbSamples,
872*0c16b537SWarner Losh                                   customDictContent, dictContentSize,
873*0c16b537SWarner Losh                                   notificationLevel);
874*0c16b537SWarner Losh         if (ZDICT_isError(eSize)) return eSize;
875*0c16b537SWarner Losh         hSize += eSize;
876*0c16b537SWarner Losh     }
877*0c16b537SWarner Losh 
878*0c16b537SWarner Losh     /* copy elements in final buffer ; note : src and dst buffer can overlap */
879*0c16b537SWarner Losh     if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
880*0c16b537SWarner Losh     {   size_t const dictSize = hSize + dictContentSize;
881*0c16b537SWarner Losh         char* dictEnd = (char*)dictBuffer + dictSize;
882*0c16b537SWarner Losh         memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
883*0c16b537SWarner Losh         memcpy(dictBuffer, header, hSize);
884*0c16b537SWarner Losh         return dictSize;
885*0c16b537SWarner Losh     }
886*0c16b537SWarner Losh }
887*0c16b537SWarner Losh 
888*0c16b537SWarner Losh 
889*0c16b537SWarner Losh size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
890*0c16b537SWarner Losh                                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
891*0c16b537SWarner Losh                                                  ZDICT_params_t params)
892*0c16b537SWarner Losh {
893*0c16b537SWarner Losh     int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
894*0c16b537SWarner Losh     U32 const notificationLevel = params.notificationLevel;
895*0c16b537SWarner Losh     size_t hSize = 8;
896*0c16b537SWarner Losh 
897*0c16b537SWarner Losh     /* calculate entropy tables */
898*0c16b537SWarner Losh     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
899*0c16b537SWarner Losh     DISPLAYLEVEL(2, "statistics ... \n");
900*0c16b537SWarner Losh     {   size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
901*0c16b537SWarner Losh                                   compressionLevel,
902*0c16b537SWarner Losh                                   samplesBuffer, samplesSizes, nbSamples,
903*0c16b537SWarner Losh                                   (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
904*0c16b537SWarner Losh                                   notificationLevel);
905*0c16b537SWarner Losh         if (ZDICT_isError(eSize)) return eSize;
906*0c16b537SWarner Losh         hSize += eSize;
907*0c16b537SWarner Losh     }
908*0c16b537SWarner Losh 
909*0c16b537SWarner Losh     /* add dictionary header (after entropy tables) */
910*0c16b537SWarner Losh     MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
911*0c16b537SWarner Losh     {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
912*0c16b537SWarner Losh         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
913*0c16b537SWarner Losh         U32 const dictID = params.dictID ? params.dictID : compliantID;
914*0c16b537SWarner Losh         MEM_writeLE32((char*)dictBuffer+4, dictID);
915*0c16b537SWarner Losh     }
916*0c16b537SWarner Losh 
917*0c16b537SWarner Losh     if (hSize + dictContentSize < dictBufferCapacity)
918*0c16b537SWarner Losh         memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
919*0c16b537SWarner Losh     return MIN(dictBufferCapacity, hSize+dictContentSize);
920*0c16b537SWarner Losh }
921*0c16b537SWarner Losh 
922*0c16b537SWarner Losh 
923*0c16b537SWarner Losh /*! ZDICT_trainFromBuffer_unsafe_legacy() :
924*0c16b537SWarner Losh *   Warning : `samplesBuffer` must be followed by noisy guard band.
925*0c16b537SWarner Losh *   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
926*0c16b537SWarner Losh */
927*0c16b537SWarner Losh size_t ZDICT_trainFromBuffer_unsafe_legacy(
928*0c16b537SWarner Losh                             void* dictBuffer, size_t maxDictSize,
929*0c16b537SWarner Losh                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
930*0c16b537SWarner Losh                             ZDICT_legacy_params_t params)
931*0c16b537SWarner Losh {
932*0c16b537SWarner Losh     U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
933*0c16b537SWarner Losh     dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
934*0c16b537SWarner Losh     unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
935*0c16b537SWarner Losh     unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
936*0c16b537SWarner Losh     size_t const targetDictSize = maxDictSize;
937*0c16b537SWarner Losh     size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
938*0c16b537SWarner Losh     size_t dictSize = 0;
939*0c16b537SWarner Losh     U32 const notificationLevel = params.zParams.notificationLevel;
940*0c16b537SWarner Losh 
941*0c16b537SWarner Losh     /* checks */
942*0c16b537SWarner Losh     if (!dictList) return ERROR(memory_allocation);
943*0c16b537SWarner Losh     if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); }   /* requested dictionary size is too small */
944*0c16b537SWarner Losh     if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* not enough source to create dictionary */
945*0c16b537SWarner Losh 
946*0c16b537SWarner Losh     /* init */
947*0c16b537SWarner Losh     ZDICT_initDictItem(dictList);
948*0c16b537SWarner Losh 
949*0c16b537SWarner Losh     /* build dictionary */
950*0c16b537SWarner Losh     ZDICT_trainBuffer_legacy(dictList, dictListSize,
951*0c16b537SWarner Losh                        samplesBuffer, samplesBuffSize,
952*0c16b537SWarner Losh                        samplesSizes, nbSamples,
953*0c16b537SWarner Losh                        minRep, notificationLevel);
954*0c16b537SWarner Losh 
955*0c16b537SWarner Losh     /* display best matches */
956*0c16b537SWarner Losh     if (params.zParams.notificationLevel>= 3) {
957*0c16b537SWarner Losh         U32 const nb = MIN(25, dictList[0].pos);
958*0c16b537SWarner Losh         U32 const dictContentSize = ZDICT_dictSize(dictList);
959*0c16b537SWarner Losh         U32 u;
960*0c16b537SWarner Losh         DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
961*0c16b537SWarner Losh         DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
962*0c16b537SWarner Losh         for (u=1; u<nb; u++) {
963*0c16b537SWarner Losh             U32 const pos = dictList[u].pos;
964*0c16b537SWarner Losh             U32 const length = dictList[u].length;
965*0c16b537SWarner Losh             U32 const printedLength = MIN(40, length);
966*0c16b537SWarner Losh             if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
967*0c16b537SWarner Losh                 return ERROR(GENERIC);   /* should never happen */
968*0c16b537SWarner Losh             DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
969*0c16b537SWarner Losh                          u, length, pos, dictList[u].savings);
970*0c16b537SWarner Losh             ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
971*0c16b537SWarner Losh             DISPLAYLEVEL(3, "| \n");
972*0c16b537SWarner Losh     }   }
973*0c16b537SWarner Losh 
974*0c16b537SWarner Losh 
975*0c16b537SWarner Losh     /* create dictionary */
976*0c16b537SWarner Losh     {   U32 dictContentSize = ZDICT_dictSize(dictList);
977*0c16b537SWarner Losh         if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
978*0c16b537SWarner Losh         if (dictContentSize < targetDictSize/4) {
979*0c16b537SWarner Losh             DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
980*0c16b537SWarner Losh             if (samplesBuffSize < 10 * targetDictSize)
981*0c16b537SWarner Losh                 DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
982*0c16b537SWarner Losh             if (minRep > MINRATIO) {
983*0c16b537SWarner Losh                 DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
984*0c16b537SWarner Losh                 DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
985*0c16b537SWarner Losh             }
986*0c16b537SWarner Losh         }
987*0c16b537SWarner Losh 
988*0c16b537SWarner Losh         if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
989*0c16b537SWarner Losh             U32 proposedSelectivity = selectivity-1;
990*0c16b537SWarner Losh             while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
991*0c16b537SWarner Losh             DISPLAYLEVEL(2, "!  note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
992*0c16b537SWarner Losh             DISPLAYLEVEL(2, "!  consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
993*0c16b537SWarner Losh             DISPLAYLEVEL(2, "!  always test dictionary efficiency on real samples \n");
994*0c16b537SWarner Losh         }
995*0c16b537SWarner Losh 
996*0c16b537SWarner Losh         /* limit dictionary size */
997*0c16b537SWarner Losh         {   U32 const max = dictList->pos;   /* convention : nb of useful elts within dictList */
998*0c16b537SWarner Losh             U32 currentSize = 0;
999*0c16b537SWarner Losh             U32 n; for (n=1; n<max; n++) {
1000*0c16b537SWarner Losh                 currentSize += dictList[n].length;
1001*0c16b537SWarner Losh                 if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
1002*0c16b537SWarner Losh             }
1003*0c16b537SWarner Losh             dictList->pos = n;
1004*0c16b537SWarner Losh             dictContentSize = currentSize;
1005*0c16b537SWarner Losh         }
1006*0c16b537SWarner Losh 
1007*0c16b537SWarner Losh         /* build dict content */
1008*0c16b537SWarner Losh         {   U32 u;
1009*0c16b537SWarner Losh             BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
1010*0c16b537SWarner Losh             for (u=1; u<dictList->pos; u++) {
1011*0c16b537SWarner Losh                 U32 l = dictList[u].length;
1012*0c16b537SWarner Losh                 ptr -= l;
1013*0c16b537SWarner Losh                 if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); }   /* should not happen */
1014*0c16b537SWarner Losh                 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
1015*0c16b537SWarner Losh         }   }
1016*0c16b537SWarner Losh 
1017*0c16b537SWarner Losh         dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
1018*0c16b537SWarner Losh                                                              samplesBuffer, samplesSizes, nbSamples,
1019*0c16b537SWarner Losh                                                              params.zParams);
1020*0c16b537SWarner Losh     }
1021*0c16b537SWarner Losh 
1022*0c16b537SWarner Losh     /* clean up */
1023*0c16b537SWarner Losh     free(dictList);
1024*0c16b537SWarner Losh     return dictSize;
1025*0c16b537SWarner Losh }
1026*0c16b537SWarner Losh 
1027*0c16b537SWarner Losh 
1028*0c16b537SWarner Losh /* issue : samplesBuffer need to be followed by a noisy guard band.
1029*0c16b537SWarner Losh *  work around : duplicate the buffer, and add the noise */
1030*0c16b537SWarner Losh size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1031*0c16b537SWarner Losh                               const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1032*0c16b537SWarner Losh                               ZDICT_legacy_params_t params)
1033*0c16b537SWarner Losh {
1034*0c16b537SWarner Losh     size_t result;
1035*0c16b537SWarner Losh     void* newBuff;
1036*0c16b537SWarner Losh     size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
1037*0c16b537SWarner Losh     if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0;   /* not enough content => no dictionary */
1038*0c16b537SWarner Losh 
1039*0c16b537SWarner Losh     newBuff = malloc(sBuffSize + NOISELENGTH);
1040*0c16b537SWarner Losh     if (!newBuff) return ERROR(memory_allocation);
1041*0c16b537SWarner Losh 
1042*0c16b537SWarner Losh     memcpy(newBuff, samplesBuffer, sBuffSize);
1043*0c16b537SWarner Losh     ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
1044*0c16b537SWarner Losh 
1045*0c16b537SWarner Losh     result =
1046*0c16b537SWarner Losh         ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
1047*0c16b537SWarner Losh                                             samplesSizes, nbSamples, params);
1048*0c16b537SWarner Losh     free(newBuff);
1049*0c16b537SWarner Losh     return result;
1050*0c16b537SWarner Losh }
1051*0c16b537SWarner Losh 
1052*0c16b537SWarner Losh 
1053*0c16b537SWarner Losh size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1054*0c16b537SWarner Losh                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1055*0c16b537SWarner Losh {
1056*0c16b537SWarner Losh     ZDICT_cover_params_t params;
1057*0c16b537SWarner Losh     memset(&params, 0, sizeof(params));
1058*0c16b537SWarner Losh     params.d = 8;
1059*0c16b537SWarner Losh     params.steps = 4;
1060*0c16b537SWarner Losh     /* Default to level 6 since no compression level information is avaialble */
1061*0c16b537SWarner Losh     params.zParams.compressionLevel = 6;
1062*0c16b537SWarner Losh     return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
1063*0c16b537SWarner Losh                                                samplesBuffer, samplesSizes,
1064*0c16b537SWarner Losh                                                nbSamples, &params);
1065*0c16b537SWarner Losh }
1066*0c16b537SWarner Losh 
1067*0c16b537SWarner Losh size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1068*0c16b537SWarner Losh                                         const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1069*0c16b537SWarner Losh {
1070*0c16b537SWarner Losh     ZDICT_params_t params;
1071*0c16b537SWarner Losh     memset(&params, 0, sizeof(params));
1072*0c16b537SWarner Losh     return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
1073*0c16b537SWarner Losh                                                      samplesBuffer, samplesSizes, nbSamples,
1074*0c16b537SWarner Losh                                                      params);
1075*0c16b537SWarner Losh }
1076