1 /* 2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 12 /*-************************************** 13 * Tuning parameters 14 ****************************************/ 15 #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */ 16 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20) 17 #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO) 18 19 20 /*-************************************** 21 * Compiler Options 22 ****************************************/ 23 /* Unix Large Files support (>4GB) */ 24 #define _FILE_OFFSET_BITS 64 25 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ 26 # define _LARGEFILE_SOURCE 27 #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */ 28 # define _LARGEFILE64_SOURCE 29 #endif 30 31 32 /*-************************************* 33 * Dependencies 34 ***************************************/ 35 #include <stdlib.h> /* malloc, free */ 36 #include <string.h> /* memset */ 37 #include <stdio.h> /* fprintf, fopen, ftello64 */ 38 #include <time.h> /* clock */ 39 40 #include "mem.h" /* read */ 41 #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */ 42 #define HUF_STATIC_LINKING_ONLY 43 #include "huf.h" /* HUF_buildCTable, HUF_writeCTable */ 44 #include "zstd_internal.h" /* includes zstd.h */ 45 #include "xxhash.h" /* XXH64 */ 46 #include "divsufsort.h" 47 #ifndef ZDICT_STATIC_LINKING_ONLY 48 # define ZDICT_STATIC_LINKING_ONLY 49 #endif 50 #include "zdict.h" 51 52 53 /*-************************************* 54 * Constants 55 ***************************************/ 56 #define KB *(1 <<10) 57 #define MB *(1 <<20) 58 #define GB *(1U<<30) 59 60 #define DICTLISTSIZE_DEFAULT 10000 61 62 #define NOISELENGTH 32 63 64 static const int g_compressionLevel_default = 3; 65 static const U32 g_selectivity_default = 9; 66 67 68 /*-************************************* 69 * Console display 70 ***************************************/ 71 #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } 72 #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ 73 74 static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; } 75 76 static void ZDICT_printHex(const void* ptr, size_t length) 77 { 78 const BYTE* const b = (const BYTE*)ptr; 79 size_t u; 80 for (u=0; u<length; u++) { 81 BYTE c = b[u]; 82 if (c<32 || c>126) c = '.'; /* non-printable char */ 83 DISPLAY("%c", c); 84 } 85 } 86 87 88 /*-******************************************************** 89 * Helper functions 90 **********************************************************/ 91 unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); } 92 93 const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } 94 95 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) 96 { 97 if (dictSize < 8) return 0; 98 if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0; 99 return MEM_readLE32((const char*)dictBuffer + 4); 100 } 101 102 103 /*-******************************************************** 104 * Dictionary training functions 105 **********************************************************/ 106 static unsigned ZDICT_NbCommonBytes (size_t val) 107 { 108 if (MEM_isLittleEndian()) { 109 if (MEM_64bits()) { 110 # if defined(_MSC_VER) && defined(_WIN64) 111 unsigned long r = 0; 112 _BitScanForward64( &r, (U64)val ); 113 return (unsigned)(r>>3); 114 # elif defined(__GNUC__) && (__GNUC__ >= 3) 115 return (__builtin_ctzll((U64)val) >> 3); 116 # else 117 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; 118 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; 119 # endif 120 } else { /* 32 bits */ 121 # if defined(_MSC_VER) 122 unsigned long r=0; 123 _BitScanForward( &r, (U32)val ); 124 return (unsigned)(r>>3); 125 # elif defined(__GNUC__) && (__GNUC__ >= 3) 126 return (__builtin_ctz((U32)val) >> 3); 127 # else 128 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; 129 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; 130 # endif 131 } 132 } else { /* Big Endian CPU */ 133 if (MEM_64bits()) { 134 # if defined(_MSC_VER) && defined(_WIN64) 135 unsigned long r = 0; 136 _BitScanReverse64( &r, val ); 137 return (unsigned)(r>>3); 138 # elif defined(__GNUC__) && (__GNUC__ >= 3) 139 return (__builtin_clzll(val) >> 3); 140 # else 141 unsigned r; 142 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ 143 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } 144 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } 145 r += (!val); 146 return r; 147 # endif 148 } else { /* 32 bits */ 149 # if defined(_MSC_VER) 150 unsigned long r = 0; 151 _BitScanReverse( &r, (unsigned long)val ); 152 return (unsigned)(r>>3); 153 # elif defined(__GNUC__) && (__GNUC__ >= 3) 154 return (__builtin_clz((U32)val) >> 3); 155 # else 156 unsigned r; 157 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } 158 r += (!val); 159 return r; 160 # endif 161 } } 162 } 163 164 165 /*! ZDICT_count() : 166 Count the nb of common bytes between 2 pointers. 167 Note : this function presumes end of buffer followed by noisy guard band. 168 */ 169 static size_t ZDICT_count(const void* pIn, const void* pMatch) 170 { 171 const char* const pStart = (const char*)pIn; 172 for (;;) { 173 size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); 174 if (!diff) { 175 pIn = (const char*)pIn+sizeof(size_t); 176 pMatch = (const char*)pMatch+sizeof(size_t); 177 continue; 178 } 179 pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff); 180 return (size_t)((const char*)pIn - pStart); 181 } 182 } 183 184 185 typedef struct { 186 U32 pos; 187 U32 length; 188 U32 savings; 189 } dictItem; 190 191 static void ZDICT_initDictItem(dictItem* d) 192 { 193 d->pos = 1; 194 d->length = 0; 195 d->savings = (U32)(-1); 196 } 197 198 199 #define LLIMIT 64 /* heuristic determined experimentally */ 200 #define MINMATCHLENGTH 7 /* heuristic determined experimentally */ 201 static dictItem ZDICT_analyzePos( 202 BYTE* doneMarks, 203 const int* suffix, U32 start, 204 const void* buffer, U32 minRatio, U32 notificationLevel) 205 { 206 U32 lengthList[LLIMIT] = {0}; 207 U32 cumulLength[LLIMIT] = {0}; 208 U32 savings[LLIMIT] = {0}; 209 const BYTE* b = (const BYTE*)buffer; 210 size_t length; 211 size_t maxLength = LLIMIT; 212 size_t pos = suffix[start]; 213 U32 end = start; 214 dictItem solution; 215 216 /* init */ 217 memset(&solution, 0, sizeof(solution)); 218 doneMarks[pos] = 1; 219 220 /* trivial repetition cases */ 221 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) 222 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) 223 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { 224 /* skip and mark segment */ 225 U16 u16 = MEM_read16(b+pos+4); 226 U32 u, e = 6; 227 while (MEM_read16(b+pos+e) == u16) e+=2 ; 228 if (b[pos+e] == b[pos+e-1]) e++; 229 for (u=1; u<e; u++) 230 doneMarks[pos+u] = 1; 231 return solution; 232 } 233 234 /* look forward */ 235 do { 236 end++; 237 length = ZDICT_count(b + pos, b + suffix[end]); 238 } while (length >=MINMATCHLENGTH); 239 240 /* look backward */ 241 do { 242 length = ZDICT_count(b + pos, b + *(suffix+start-1)); 243 if (length >=MINMATCHLENGTH) start--; 244 } while(length >= MINMATCHLENGTH); 245 246 /* exit if not found a minimum nb of repetitions */ 247 if (end-start < minRatio) { 248 U32 idx; 249 for(idx=start; idx<end; idx++) 250 doneMarks[suffix[idx]] = 1; 251 return solution; 252 } 253 254 { int i; 255 U32 searchLength; 256 U32 refinedStart = start; 257 U32 refinedEnd = end; 258 259 DISPLAYLEVEL(4, "\n"); 260 DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos); 261 DISPLAYLEVEL(4, "\n"); 262 263 for (searchLength = MINMATCHLENGTH ; ; searchLength++) { 264 BYTE currentChar = 0; 265 U32 currentCount = 0; 266 U32 currentID = refinedStart; 267 U32 id; 268 U32 selectedCount = 0; 269 U32 selectedID = currentID; 270 for (id =refinedStart; id < refinedEnd; id++) { 271 if (b[ suffix[id] + searchLength] != currentChar) { 272 if (currentCount > selectedCount) { 273 selectedCount = currentCount; 274 selectedID = currentID; 275 } 276 currentID = id; 277 currentChar = b[ suffix[id] + searchLength]; 278 currentCount = 0; 279 } 280 currentCount ++; 281 } 282 if (currentCount > selectedCount) { /* for last */ 283 selectedCount = currentCount; 284 selectedID = currentID; 285 } 286 287 if (selectedCount < minRatio) 288 break; 289 refinedStart = selectedID; 290 refinedEnd = refinedStart + selectedCount; 291 } 292 293 /* evaluate gain based on new ref */ 294 start = refinedStart; 295 pos = suffix[refinedStart]; 296 end = start; 297 memset(lengthList, 0, sizeof(lengthList)); 298 299 /* look forward */ 300 do { 301 end++; 302 length = ZDICT_count(b + pos, b + suffix[end]); 303 if (length >= LLIMIT) length = LLIMIT-1; 304 lengthList[length]++; 305 } while (length >=MINMATCHLENGTH); 306 307 /* look backward */ 308 length = MINMATCHLENGTH; 309 while ((length >= MINMATCHLENGTH) & (start > 0)) { 310 length = ZDICT_count(b + pos, b + suffix[start - 1]); 311 if (length >= LLIMIT) length = LLIMIT - 1; 312 lengthList[length]++; 313 if (length >= MINMATCHLENGTH) start--; 314 } 315 316 /* largest useful length */ 317 memset(cumulLength, 0, sizeof(cumulLength)); 318 cumulLength[maxLength-1] = lengthList[maxLength-1]; 319 for (i=(int)(maxLength-2); i>=0; i--) 320 cumulLength[i] = cumulLength[i+1] + lengthList[i]; 321 322 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break; 323 maxLength = i; 324 325 /* reduce maxLength in case of final into repetitive data */ 326 { U32 l = (U32)maxLength; 327 BYTE const c = b[pos + maxLength-1]; 328 while (b[pos+l-2]==c) l--; 329 maxLength = l; 330 } 331 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */ 332 333 /* calculate savings */ 334 savings[5] = 0; 335 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) 336 savings[i] = savings[i-1] + (lengthList[i] * (i-3)); 337 338 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n", 339 (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength); 340 341 solution.pos = (U32)pos; 342 solution.length = (U32)maxLength; 343 solution.savings = savings[maxLength]; 344 345 /* mark positions done */ 346 { U32 id; 347 for (id=start; id<end; id++) { 348 U32 p, pEnd; 349 U32 const testedPos = suffix[id]; 350 if (testedPos == pos) 351 length = solution.length; 352 else { 353 length = ZDICT_count(b+pos, b+testedPos); 354 if (length > solution.length) length = solution.length; 355 } 356 pEnd = (U32)(testedPos + length); 357 for (p=testedPos; p<pEnd; p++) 358 doneMarks[p] = 1; 359 } } } 360 361 return solution; 362 } 363 364 365 static int isIncluded(const void* in, const void* container, size_t length) 366 { 367 const char* const ip = (const char*) in; 368 const char* const into = (const char*) container; 369 size_t u; 370 371 for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */ 372 if (ip[u] != into[u]) break; 373 } 374 375 return u==length; 376 } 377 378 /*! ZDICT_tryMerge() : 379 check if dictItem can be merged, do it if possible 380 @return : id of destination elt, 0 if not merged 381 */ 382 static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer) 383 { 384 const U32 tableSize = table->pos; 385 const U32 eltEnd = elt.pos + elt.length; 386 const char* const buf = (const char*) buffer; 387 388 /* tail overlap */ 389 U32 u; for (u=1; u<tableSize; u++) { 390 if (u==eltNbToSkip) continue; 391 if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */ 392 /* append */ 393 U32 const addedLength = table[u].pos - elt.pos; 394 table[u].length += addedLength; 395 table[u].pos = elt.pos; 396 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ 397 table[u].savings += elt.length / 8; /* rough approx bonus */ 398 elt = table[u]; 399 /* sort : improve rank */ 400 while ((u>1) && (table[u-1].savings < elt.savings)) 401 table[u] = table[u-1], u--; 402 table[u] = elt; 403 return u; 404 } } 405 406 /* front overlap */ 407 for (u=1; u<tableSize; u++) { 408 if (u==eltNbToSkip) continue; 409 410 if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */ 411 /* append */ 412 int const addedLength = (int)eltEnd - (table[u].pos + table[u].length); 413 table[u].savings += elt.length / 8; /* rough approx bonus */ 414 if (addedLength > 0) { /* otherwise, elt fully included into existing */ 415 table[u].length += addedLength; 416 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ 417 } 418 /* sort : improve rank */ 419 elt = table[u]; 420 while ((u>1) && (table[u-1].savings < elt.savings)) 421 table[u] = table[u-1], u--; 422 table[u] = elt; 423 return u; 424 } 425 426 if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) { 427 if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) { 428 size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 ); 429 table[u].pos = elt.pos; 430 table[u].savings += (U32)(elt.savings * addedLength / elt.length); 431 table[u].length = MIN(elt.length, table[u].length + 1); 432 return u; 433 } 434 } 435 } 436 437 return 0; 438 } 439 440 441 static void ZDICT_removeDictItem(dictItem* table, U32 id) 442 { 443 /* convention : table[0].pos stores nb of elts */ 444 U32 const max = table[0].pos; 445 U32 u; 446 if (!id) return; /* protection, should never happen */ 447 for (u=id; u<max-1; u++) 448 table[u] = table[u+1]; 449 table->pos--; 450 } 451 452 453 static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer) 454 { 455 /* merge if possible */ 456 U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer); 457 if (mergeId) { 458 U32 newMerge = 1; 459 while (newMerge) { 460 newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer); 461 if (newMerge) ZDICT_removeDictItem(table, mergeId); 462 mergeId = newMerge; 463 } 464 return; 465 } 466 467 /* insert */ 468 { U32 current; 469 U32 nextElt = table->pos; 470 if (nextElt >= maxSize) nextElt = maxSize-1; 471 current = nextElt-1; 472 while (table[current].savings < elt.savings) { 473 table[current+1] = table[current]; 474 current--; 475 } 476 table[current+1] = elt; 477 table->pos = nextElt+1; 478 } 479 } 480 481 482 static U32 ZDICT_dictSize(const dictItem* dictList) 483 { 484 U32 u, dictSize = 0; 485 for (u=1; u<dictList[0].pos; u++) 486 dictSize += dictList[u].length; 487 return dictSize; 488 } 489 490 491 static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize, 492 const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */ 493 const size_t* fileSizes, unsigned nbFiles, 494 U32 minRatio, U32 notificationLevel) 495 { 496 int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0)); 497 int* const suffix = suffix0+1; 498 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix)); 499 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */ 500 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos)); 501 size_t result = 0; 502 clock_t displayClock = 0; 503 clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10; 504 505 # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \ 506 if (ZDICT_clockSpan(displayClock) > refreshRate) \ 507 { displayClock = clock(); DISPLAY(__VA_ARGS__); \ 508 if (notificationLevel>=4) fflush(stderr); } } 509 510 /* init */ 511 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 512 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) { 513 result = ERROR(memory_allocation); 514 goto _cleanup; 515 } 516 if (minRatio < MINRATIO) minRatio = MINRATIO; 517 memset(doneMarks, 0, bufferSize+16); 518 519 /* limit sample set size (divsufsort limitation)*/ 520 if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20)); 521 while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles]; 522 523 /* sort */ 524 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20)); 525 { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0); 526 if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; } 527 } 528 suffix[bufferSize] = (int)bufferSize; /* leads into noise */ 529 suffix0[0] = (int)bufferSize; /* leads into noise */ 530 /* build reverse suffix sort */ 531 { size_t pos; 532 for (pos=0; pos < bufferSize; pos++) 533 reverseSuffix[suffix[pos]] = (U32)pos; 534 /* note filePos tracks borders between samples. 535 It's not used at this stage, but planned to become useful in a later update */ 536 filePos[0] = 0; 537 for (pos=1; pos<nbFiles; pos++) 538 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]); 539 } 540 541 DISPLAYLEVEL(2, "finding patterns ... \n"); 542 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio); 543 544 { U32 cursor; for (cursor=0; cursor < bufferSize; ) { 545 dictItem solution; 546 if (doneMarks[cursor]) { cursor++; continue; } 547 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel); 548 if (solution.length==0) { cursor++; continue; } 549 ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); 550 cursor += solution.length; 551 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100); 552 } } 553 554 _cleanup: 555 free(suffix0); 556 free(reverseSuffix); 557 free(doneMarks); 558 free(filePos); 559 return result; 560 } 561 562 563 static void ZDICT_fillNoise(void* buffer, size_t length) 564 { 565 unsigned const prime1 = 2654435761U; 566 unsigned const prime2 = 2246822519U; 567 unsigned acc = prime1; 568 size_t p=0;; 569 for (p=0; p<length; p++) { 570 acc *= prime2; 571 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21); 572 } 573 } 574 575 576 typedef struct 577 { 578 ZSTD_CCtx* ref; 579 ZSTD_CCtx* zc; 580 void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ 581 } EStats_ress_t; 582 583 #define MAXREPOFFSET 1024 584 585 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, 586 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, 587 const void* src, size_t srcSize, U32 notificationLevel) 588 { 589 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); 590 size_t cSize; 591 592 if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ 593 { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); 594 if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } 595 } 596 cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); 597 if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } 598 599 if (cSize) { /* if == 0; block is not compressible */ 600 const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); 601 602 /* literals stats */ 603 { const BYTE* bytePtr; 604 for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) 605 countLit[*bytePtr]++; 606 } 607 608 /* seqStats */ 609 { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); 610 ZSTD_seqToCodes(seqStorePtr); 611 612 { const BYTE* codePtr = seqStorePtr->ofCode; 613 U32 u; 614 for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++; 615 } 616 617 { const BYTE* codePtr = seqStorePtr->mlCode; 618 U32 u; 619 for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++; 620 } 621 622 { const BYTE* codePtr = seqStorePtr->llCode; 623 U32 u; 624 for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++; 625 } 626 627 if (nbSeq >= 2) { /* rep offsets */ 628 const seqDef* const seq = seqStorePtr->sequencesStart; 629 U32 offset1 = seq[0].offset - 3; 630 U32 offset2 = seq[1].offset - 3; 631 if (offset1 >= MAXREPOFFSET) offset1 = 0; 632 if (offset2 >= MAXREPOFFSET) offset2 = 0; 633 repOffsets[offset1] += 3; 634 repOffsets[offset2] += 1; 635 } } } 636 } 637 638 static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles) 639 { 640 size_t total=0; 641 unsigned u; 642 for (u=0; u<nbFiles; u++) total += fileSizes[u]; 643 return total; 644 } 645 646 typedef struct { U32 offset; U32 count; } offsetCount_t; 647 648 static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count) 649 { 650 U32 u; 651 table[ZSTD_REP_NUM].offset = val; 652 table[ZSTD_REP_NUM].count = count; 653 for (u=ZSTD_REP_NUM; u>0; u--) { 654 offsetCount_t tmp; 655 if (table[u-1].count >= table[u].count) break; 656 tmp = table[u-1]; 657 table[u-1] = table[u]; 658 table[u] = tmp; 659 } 660 } 661 662 663 #define OFFCODE_MAX 30 /* only applicable to first block */ 664 static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, 665 unsigned compressionLevel, 666 const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles, 667 const void* dictBuffer, size_t dictBufferSize, 668 unsigned notificationLevel) 669 { 670 U32 countLit[256]; 671 HUF_CREATE_STATIC_CTABLE(hufTable, 255); 672 U32 offcodeCount[OFFCODE_MAX+1]; 673 short offcodeNCount[OFFCODE_MAX+1]; 674 U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB)); 675 U32 matchLengthCount[MaxML+1]; 676 short matchLengthNCount[MaxML+1]; 677 U32 litLengthCount[MaxLL+1]; 678 short litLengthNCount[MaxLL+1]; 679 U32 repOffset[MAXREPOFFSET]; 680 offsetCount_t bestRepOffset[ZSTD_REP_NUM+1]; 681 EStats_ress_t esr; 682 ZSTD_parameters params; 683 U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; 684 size_t pos = 0, errorCode; 685 size_t eSize = 0; 686 size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); 687 size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles); 688 BYTE* dstPtr = (BYTE*)dstBuffer; 689 690 /* init */ 691 esr.ref = ZSTD_createCCtx(); 692 esr.zc = ZSTD_createCCtx(); 693 esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); 694 if (!esr.ref || !esr.zc || !esr.workPlace) { 695 eSize = ERROR(memory_allocation); 696 DISPLAYLEVEL(1, "Not enough memory \n"); 697 goto _cleanup; 698 } 699 if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */ 700 for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */ 701 for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1; 702 for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1; 703 for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1; 704 memset(repOffset, 0, sizeof(repOffset)); 705 repOffset[1] = repOffset[4] = repOffset[8] = 1; 706 memset(bestRepOffset, 0, sizeof(bestRepOffset)); 707 if (compressionLevel<=0) compressionLevel = g_compressionLevel_default; 708 params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); 709 { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0); 710 if (ZSTD_isError(beginResult)) { 711 DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult)); 712 eSize = ERROR(GENERIC); 713 goto _cleanup; 714 } } 715 716 /* collect stats on all files */ 717 for (u=0; u<nbFiles; u++) { 718 ZDICT_countEStats(esr, params, 719 countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, 720 (const char*)srcBuffer + pos, fileSizes[u], 721 notificationLevel); 722 pos += fileSizes[u]; 723 } 724 725 /* analyze */ 726 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog); 727 if (HUF_isError(errorCode)) { 728 eSize = ERROR(GENERIC); 729 DISPLAYLEVEL(1, "HUF_buildCTable error \n"); 730 goto _cleanup; 731 } 732 huffLog = (U32)errorCode; 733 734 /* looking for most common first offsets */ 735 { U32 offset; 736 for (offset=1; offset<MAXREPOFFSET; offset++) 737 ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]); 738 } 739 /* note : the result of this phase should be used to better appreciate the impact on statistics */ 740 741 total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u]; 742 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax); 743 if (FSE_isError(errorCode)) { 744 eSize = ERROR(GENERIC); 745 DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n"); 746 goto _cleanup; 747 } 748 Offlog = (U32)errorCode; 749 750 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u]; 751 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML); 752 if (FSE_isError(errorCode)) { 753 eSize = ERROR(GENERIC); 754 DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n"); 755 goto _cleanup; 756 } 757 mlLog = (U32)errorCode; 758 759 total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u]; 760 errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL); 761 if (FSE_isError(errorCode)) { 762 eSize = ERROR(GENERIC); 763 DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n"); 764 goto _cleanup; 765 } 766 llLog = (U32)errorCode; 767 768 /* write result to buffer */ 769 { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog); 770 if (HUF_isError(hhSize)) { 771 eSize = ERROR(GENERIC); 772 DISPLAYLEVEL(1, "HUF_writeCTable error \n"); 773 goto _cleanup; 774 } 775 dstPtr += hhSize; 776 maxDstSize -= hhSize; 777 eSize += hhSize; 778 } 779 780 { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog); 781 if (FSE_isError(ohSize)) { 782 eSize = ERROR(GENERIC); 783 DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n"); 784 goto _cleanup; 785 } 786 dstPtr += ohSize; 787 maxDstSize -= ohSize; 788 eSize += ohSize; 789 } 790 791 { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog); 792 if (FSE_isError(mhSize)) { 793 eSize = ERROR(GENERIC); 794 DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n"); 795 goto _cleanup; 796 } 797 dstPtr += mhSize; 798 maxDstSize -= mhSize; 799 eSize += mhSize; 800 } 801 802 { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog); 803 if (FSE_isError(lhSize)) { 804 eSize = ERROR(GENERIC); 805 DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n"); 806 goto _cleanup; 807 } 808 dstPtr += lhSize; 809 maxDstSize -= lhSize; 810 eSize += lhSize; 811 } 812 813 if (maxDstSize<12) { 814 eSize = ERROR(GENERIC); 815 DISPLAYLEVEL(1, "not enough space to write RepOffsets \n"); 816 goto _cleanup; 817 } 818 # if 0 819 MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset); 820 MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset); 821 MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset); 822 #else 823 /* at this stage, we don't use the result of "most common first offset", 824 as the impact of statistics is not properly evaluated */ 825 MEM_writeLE32(dstPtr+0, repStartValue[0]); 826 MEM_writeLE32(dstPtr+4, repStartValue[1]); 827 MEM_writeLE32(dstPtr+8, repStartValue[2]); 828 #endif 829 eSize += 12; 830 831 _cleanup: 832 ZSTD_freeCCtx(esr.ref); 833 ZSTD_freeCCtx(esr.zc); 834 free(esr.workPlace); 835 836 return eSize; 837 } 838 839 840 841 size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, 842 const void* customDictContent, size_t dictContentSize, 843 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 844 ZDICT_params_t params) 845 { 846 size_t hSize; 847 #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */ 848 BYTE header[HBUFFSIZE]; 849 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; 850 U32 const notificationLevel = params.notificationLevel; 851 852 /* check conditions */ 853 if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); 854 if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong); 855 if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); 856 857 /* dictionary header */ 858 MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY); 859 { U64 const randomID = XXH64(customDictContent, dictContentSize, 0); 860 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; 861 U32 const dictID = params.dictID ? params.dictID : compliantID; 862 MEM_writeLE32(header+4, dictID); 863 } 864 hSize = 8; 865 866 /* entropy tables */ 867 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 868 DISPLAYLEVEL(2, "statistics ... \n"); 869 { size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize, 870 compressionLevel, 871 samplesBuffer, samplesSizes, nbSamples, 872 customDictContent, dictContentSize, 873 notificationLevel); 874 if (ZDICT_isError(eSize)) return eSize; 875 hSize += eSize; 876 } 877 878 /* copy elements in final buffer ; note : src and dst buffer can overlap */ 879 if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize; 880 { size_t const dictSize = hSize + dictContentSize; 881 char* dictEnd = (char*)dictBuffer + dictSize; 882 memmove(dictEnd - dictContentSize, customDictContent, dictContentSize); 883 memcpy(dictBuffer, header, hSize); 884 return dictSize; 885 } 886 } 887 888 889 size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, 890 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 891 ZDICT_params_t params) 892 { 893 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; 894 U32 const notificationLevel = params.notificationLevel; 895 size_t hSize = 8; 896 897 /* calculate entropy tables */ 898 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 899 DISPLAYLEVEL(2, "statistics ... \n"); 900 { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize, 901 compressionLevel, 902 samplesBuffer, samplesSizes, nbSamples, 903 (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 904 notificationLevel); 905 if (ZDICT_isError(eSize)) return eSize; 906 hSize += eSize; 907 } 908 909 /* add dictionary header (after entropy tables) */ 910 MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY); 911 { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0); 912 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; 913 U32 const dictID = params.dictID ? params.dictID : compliantID; 914 MEM_writeLE32((char*)dictBuffer+4, dictID); 915 } 916 917 if (hSize + dictContentSize < dictBufferCapacity) 918 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); 919 return MIN(dictBufferCapacity, hSize+dictContentSize); 920 } 921 922 923 /*! ZDICT_trainFromBuffer_unsafe_legacy() : 924 * Warning : `samplesBuffer` must be followed by noisy guard band. 925 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError() 926 */ 927 size_t ZDICT_trainFromBuffer_unsafe_legacy( 928 void* dictBuffer, size_t maxDictSize, 929 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 930 ZDICT_legacy_params_t params) 931 { 932 U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16)); 933 dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); 934 unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel; 935 unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity; 936 size_t const targetDictSize = maxDictSize; 937 size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); 938 size_t dictSize = 0; 939 U32 const notificationLevel = params.zParams.notificationLevel; 940 941 /* checks */ 942 if (!dictList) return ERROR(memory_allocation); 943 if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */ 944 if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */ 945 946 /* init */ 947 ZDICT_initDictItem(dictList); 948 949 /* build dictionary */ 950 ZDICT_trainBuffer_legacy(dictList, dictListSize, 951 samplesBuffer, samplesBuffSize, 952 samplesSizes, nbSamples, 953 minRep, notificationLevel); 954 955 /* display best matches */ 956 if (params.zParams.notificationLevel>= 3) { 957 U32 const nb = MIN(25, dictList[0].pos); 958 U32 const dictContentSize = ZDICT_dictSize(dictList); 959 U32 u; 960 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize); 961 DISPLAYLEVEL(3, "list %u best segments \n", nb-1); 962 for (u=1; u<nb; u++) { 963 U32 const pos = dictList[u].pos; 964 U32 const length = dictList[u].length; 965 U32 const printedLength = MIN(40, length); 966 if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) 967 return ERROR(GENERIC); /* should never happen */ 968 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", 969 u, length, pos, dictList[u].savings); 970 ZDICT_printHex((const char*)samplesBuffer+pos, printedLength); 971 DISPLAYLEVEL(3, "| \n"); 972 } } 973 974 975 /* create dictionary */ 976 { U32 dictContentSize = ZDICT_dictSize(dictList); 977 if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */ 978 if (dictContentSize < targetDictSize/4) { 979 DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize); 980 if (samplesBuffSize < 10 * targetDictSize) 981 DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20)); 982 if (minRep > MINRATIO) { 983 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1); 984 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n"); 985 } 986 } 987 988 if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) { 989 U32 proposedSelectivity = selectivity-1; 990 while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; } 991 DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize); 992 DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity); 993 DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n"); 994 } 995 996 /* limit dictionary size */ 997 { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */ 998 U32 currentSize = 0; 999 U32 n; for (n=1; n<max; n++) { 1000 currentSize += dictList[n].length; 1001 if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; } 1002 } 1003 dictList->pos = n; 1004 dictContentSize = currentSize; 1005 } 1006 1007 /* build dict content */ 1008 { U32 u; 1009 BYTE* ptr = (BYTE*)dictBuffer + maxDictSize; 1010 for (u=1; u<dictList->pos; u++) { 1011 U32 l = dictList[u].length; 1012 ptr -= l; 1013 if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */ 1014 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); 1015 } } 1016 1017 dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, 1018 samplesBuffer, samplesSizes, nbSamples, 1019 params.zParams); 1020 } 1021 1022 /* clean up */ 1023 free(dictList); 1024 return dictSize; 1025 } 1026 1027 1028 /* issue : samplesBuffer need to be followed by a noisy guard band. 1029 * work around : duplicate the buffer, and add the noise */ 1030 size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity, 1031 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 1032 ZDICT_legacy_params_t params) 1033 { 1034 size_t result; 1035 void* newBuff; 1036 size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); 1037 if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */ 1038 1039 newBuff = malloc(sBuffSize + NOISELENGTH); 1040 if (!newBuff) return ERROR(memory_allocation); 1041 1042 memcpy(newBuff, samplesBuffer, sBuffSize); 1043 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ 1044 1045 result = 1046 ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff, 1047 samplesSizes, nbSamples, params); 1048 free(newBuff); 1049 return result; 1050 } 1051 1052 1053 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, 1054 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) 1055 { 1056 ZDICT_cover_params_t params; 1057 memset(¶ms, 0, sizeof(params)); 1058 params.d = 8; 1059 params.steps = 4; 1060 /* Default to level 6 since no compression level information is avaialble */ 1061 params.zParams.compressionLevel = 6; 1062 return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity, 1063 samplesBuffer, samplesSizes, 1064 nbSamples, ¶ms); 1065 } 1066 1067 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, 1068 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) 1069 { 1070 ZDICT_params_t params; 1071 memset(¶ms, 0, sizeof(params)); 1072 return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, 1073 samplesBuffer, samplesSizes, nbSamples, 1074 params); 1075 } 1076