1 /* 2 * Copyright (c) Yann Collet, Facebook, Inc. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 12 /*-************************************** 13 * Tuning parameters 14 ****************************************/ 15 #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */ 16 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20) 17 #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO) 18 19 20 /*-************************************** 21 * Compiler Options 22 ****************************************/ 23 /* Unix Large Files support (>4GB) */ 24 #define _FILE_OFFSET_BITS 64 25 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ 26 # ifndef _LARGEFILE_SOURCE 27 # define _LARGEFILE_SOURCE 28 # endif 29 #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */ 30 # ifndef _LARGEFILE64_SOURCE 31 # define _LARGEFILE64_SOURCE 32 # endif 33 #endif 34 35 36 /*-************************************* 37 * Dependencies 38 ***************************************/ 39 #include <stdlib.h> /* malloc, free */ 40 #include <string.h> /* memset */ 41 #include <stdio.h> /* fprintf, fopen, ftello64 */ 42 #include <time.h> /* clock */ 43 44 #ifndef ZDICT_STATIC_LINKING_ONLY 45 # define ZDICT_STATIC_LINKING_ONLY 46 #endif 47 #define HUF_STATIC_LINKING_ONLY 48 49 #include "../common/mem.h" /* read */ 50 #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */ 51 #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */ 52 #include "../common/zstd_internal.h" /* includes zstd.h */ 53 #include "../common/xxhash.h" /* XXH64 */ 54 #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */ 55 #include "../zdict.h" 56 #include "divsufsort.h" 57 58 59 /*-************************************* 60 * Constants 61 ***************************************/ 62 #define KB *(1 <<10) 63 #define MB *(1 <<20) 64 #define GB *(1U<<30) 65 66 #define DICTLISTSIZE_DEFAULT 10000 67 68 #define NOISELENGTH 32 69 70 static const U32 g_selectivity_default = 9; 71 72 73 /*-************************************* 74 * Console display 75 ***************************************/ 76 #undef DISPLAY 77 #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } 78 #undef DISPLAYLEVEL 79 #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ 80 81 static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; } 82 83 static void ZDICT_printHex(const void* ptr, size_t length) 84 { 85 const BYTE* const b = (const BYTE*)ptr; 86 size_t u; 87 for (u=0; u<length; u++) { 88 BYTE c = b[u]; 89 if (c<32 || c>126) c = '.'; /* non-printable char */ 90 DISPLAY("%c", c); 91 } 92 } 93 94 95 /*-******************************************************** 96 * Helper functions 97 **********************************************************/ 98 unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); } 99 100 const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } 101 102 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) 103 { 104 if (dictSize < 8) return 0; 105 if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0; 106 return MEM_readLE32((const char*)dictBuffer + 4); 107 } 108 109 size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize) 110 { 111 size_t headerSize; 112 if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted); 113 114 { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t)); 115 U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE); 116 if (!bs || !wksp) { 117 headerSize = ERROR(memory_allocation); 118 } else { 119 ZSTD_reset_compressedBlockState(bs); 120 headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize); 121 } 122 123 free(bs); 124 free(wksp); 125 } 126 127 return headerSize; 128 } 129 130 /*-******************************************************** 131 * Dictionary training functions 132 **********************************************************/ 133 static unsigned ZDICT_NbCommonBytes (size_t val) 134 { 135 if (MEM_isLittleEndian()) { 136 if (MEM_64bits()) { 137 # if defined(_MSC_VER) && defined(_WIN64) 138 if (val != 0) { 139 unsigned long r; 140 _BitScanForward64(&r, (U64)val); 141 return (unsigned)(r >> 3); 142 } else { 143 /* Should not reach this code path */ 144 __assume(0); 145 } 146 # elif defined(__GNUC__) && (__GNUC__ >= 3) 147 return (unsigned)(__builtin_ctzll((U64)val) >> 3); 148 # else 149 static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; 150 return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; 151 # endif 152 } else { /* 32 bits */ 153 # if defined(_MSC_VER) 154 if (val != 0) { 155 unsigned long r; 156 _BitScanForward(&r, (U32)val); 157 return (unsigned)(r >> 3); 158 } else { 159 /* Should not reach this code path */ 160 __assume(0); 161 } 162 # elif defined(__GNUC__) && (__GNUC__ >= 3) 163 return (unsigned)(__builtin_ctz((U32)val) >> 3); 164 # else 165 static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; 166 return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; 167 # endif 168 } 169 } else { /* Big Endian CPU */ 170 if (MEM_64bits()) { 171 # if defined(_MSC_VER) && defined(_WIN64) 172 if (val != 0) { 173 unsigned long r; 174 _BitScanReverse64(&r, val); 175 return (unsigned)(r >> 3); 176 } else { 177 /* Should not reach this code path */ 178 __assume(0); 179 } 180 # elif defined(__GNUC__) && (__GNUC__ >= 3) 181 return (unsigned)(__builtin_clzll(val) >> 3); 182 # else 183 unsigned r; 184 const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ 185 if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } 186 if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } 187 r += (!val); 188 return r; 189 # endif 190 } else { /* 32 bits */ 191 # if defined(_MSC_VER) 192 if (val != 0) { 193 unsigned long r; 194 _BitScanReverse(&r, (unsigned long)val); 195 return (unsigned)(r >> 3); 196 } else { 197 /* Should not reach this code path */ 198 __assume(0); 199 } 200 # elif defined(__GNUC__) && (__GNUC__ >= 3) 201 return (unsigned)(__builtin_clz((U32)val) >> 3); 202 # else 203 unsigned r; 204 if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } 205 r += (!val); 206 return r; 207 # endif 208 } } 209 } 210 211 212 /*! ZDICT_count() : 213 Count the nb of common bytes between 2 pointers. 214 Note : this function presumes end of buffer followed by noisy guard band. 215 */ 216 static size_t ZDICT_count(const void* pIn, const void* pMatch) 217 { 218 const char* const pStart = (const char*)pIn; 219 for (;;) { 220 size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); 221 if (!diff) { 222 pIn = (const char*)pIn+sizeof(size_t); 223 pMatch = (const char*)pMatch+sizeof(size_t); 224 continue; 225 } 226 pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff); 227 return (size_t)((const char*)pIn - pStart); 228 } 229 } 230 231 232 typedef struct { 233 U32 pos; 234 U32 length; 235 U32 savings; 236 } dictItem; 237 238 static void ZDICT_initDictItem(dictItem* d) 239 { 240 d->pos = 1; 241 d->length = 0; 242 d->savings = (U32)(-1); 243 } 244 245 246 #define LLIMIT 64 /* heuristic determined experimentally */ 247 #define MINMATCHLENGTH 7 /* heuristic determined experimentally */ 248 static dictItem ZDICT_analyzePos( 249 BYTE* doneMarks, 250 const int* suffix, U32 start, 251 const void* buffer, U32 minRatio, U32 notificationLevel) 252 { 253 U32 lengthList[LLIMIT] = {0}; 254 U32 cumulLength[LLIMIT] = {0}; 255 U32 savings[LLIMIT] = {0}; 256 const BYTE* b = (const BYTE*)buffer; 257 size_t maxLength = LLIMIT; 258 size_t pos = (size_t)suffix[start]; 259 U32 end = start; 260 dictItem solution; 261 262 /* init */ 263 memset(&solution, 0, sizeof(solution)); 264 doneMarks[pos] = 1; 265 266 /* trivial repetition cases */ 267 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) 268 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) 269 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { 270 /* skip and mark segment */ 271 U16 const pattern16 = MEM_read16(b+pos+4); 272 U32 u, patternEnd = 6; 273 while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ; 274 if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++; 275 for (u=1; u<patternEnd; u++) 276 doneMarks[pos+u] = 1; 277 return solution; 278 } 279 280 /* look forward */ 281 { size_t length; 282 do { 283 end++; 284 length = ZDICT_count(b + pos, b + suffix[end]); 285 } while (length >= MINMATCHLENGTH); 286 } 287 288 /* look backward */ 289 { size_t length; 290 do { 291 length = ZDICT_count(b + pos, b + *(suffix+start-1)); 292 if (length >=MINMATCHLENGTH) start--; 293 } while(length >= MINMATCHLENGTH); 294 } 295 296 /* exit if not found a minimum nb of repetitions */ 297 if (end-start < minRatio) { 298 U32 idx; 299 for(idx=start; idx<end; idx++) 300 doneMarks[suffix[idx]] = 1; 301 return solution; 302 } 303 304 { int i; 305 U32 mml; 306 U32 refinedStart = start; 307 U32 refinedEnd = end; 308 309 DISPLAYLEVEL(4, "\n"); 310 DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos); 311 DISPLAYLEVEL(4, "\n"); 312 313 for (mml = MINMATCHLENGTH ; ; mml++) { 314 BYTE currentChar = 0; 315 U32 currentCount = 0; 316 U32 currentID = refinedStart; 317 U32 id; 318 U32 selectedCount = 0; 319 U32 selectedID = currentID; 320 for (id =refinedStart; id < refinedEnd; id++) { 321 if (b[suffix[id] + mml] != currentChar) { 322 if (currentCount > selectedCount) { 323 selectedCount = currentCount; 324 selectedID = currentID; 325 } 326 currentID = id; 327 currentChar = b[ suffix[id] + mml]; 328 currentCount = 0; 329 } 330 currentCount ++; 331 } 332 if (currentCount > selectedCount) { /* for last */ 333 selectedCount = currentCount; 334 selectedID = currentID; 335 } 336 337 if (selectedCount < minRatio) 338 break; 339 refinedStart = selectedID; 340 refinedEnd = refinedStart + selectedCount; 341 } 342 343 /* evaluate gain based on new dict */ 344 start = refinedStart; 345 pos = suffix[refinedStart]; 346 end = start; 347 memset(lengthList, 0, sizeof(lengthList)); 348 349 /* look forward */ 350 { size_t length; 351 do { 352 end++; 353 length = ZDICT_count(b + pos, b + suffix[end]); 354 if (length >= LLIMIT) length = LLIMIT-1; 355 lengthList[length]++; 356 } while (length >=MINMATCHLENGTH); 357 } 358 359 /* look backward */ 360 { size_t length = MINMATCHLENGTH; 361 while ((length >= MINMATCHLENGTH) & (start > 0)) { 362 length = ZDICT_count(b + pos, b + suffix[start - 1]); 363 if (length >= LLIMIT) length = LLIMIT - 1; 364 lengthList[length]++; 365 if (length >= MINMATCHLENGTH) start--; 366 } 367 } 368 369 /* largest useful length */ 370 memset(cumulLength, 0, sizeof(cumulLength)); 371 cumulLength[maxLength-1] = lengthList[maxLength-1]; 372 for (i=(int)(maxLength-2); i>=0; i--) 373 cumulLength[i] = cumulLength[i+1] + lengthList[i]; 374 375 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break; 376 maxLength = i; 377 378 /* reduce maxLength in case of final into repetitive data */ 379 { U32 l = (U32)maxLength; 380 BYTE const c = b[pos + maxLength-1]; 381 while (b[pos+l-2]==c) l--; 382 maxLength = l; 383 } 384 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */ 385 386 /* calculate savings */ 387 savings[5] = 0; 388 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) 389 savings[i] = savings[i-1] + (lengthList[i] * (i-3)); 390 391 DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n", 392 (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength); 393 394 solution.pos = (U32)pos; 395 solution.length = (U32)maxLength; 396 solution.savings = savings[maxLength]; 397 398 /* mark positions done */ 399 { U32 id; 400 for (id=start; id<end; id++) { 401 U32 p, pEnd, length; 402 U32 const testedPos = (U32)suffix[id]; 403 if (testedPos == pos) 404 length = solution.length; 405 else { 406 length = (U32)ZDICT_count(b+pos, b+testedPos); 407 if (length > solution.length) length = solution.length; 408 } 409 pEnd = (U32)(testedPos + length); 410 for (p=testedPos; p<pEnd; p++) 411 doneMarks[p] = 1; 412 } } } 413 414 return solution; 415 } 416 417 418 static int isIncluded(const void* in, const void* container, size_t length) 419 { 420 const char* const ip = (const char*) in; 421 const char* const into = (const char*) container; 422 size_t u; 423 424 for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */ 425 if (ip[u] != into[u]) break; 426 } 427 428 return u==length; 429 } 430 431 /*! ZDICT_tryMerge() : 432 check if dictItem can be merged, do it if possible 433 @return : id of destination elt, 0 if not merged 434 */ 435 static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer) 436 { 437 const U32 tableSize = table->pos; 438 const U32 eltEnd = elt.pos + elt.length; 439 const char* const buf = (const char*) buffer; 440 441 /* tail overlap */ 442 U32 u; for (u=1; u<tableSize; u++) { 443 if (u==eltNbToSkip) continue; 444 if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */ 445 /* append */ 446 U32 const addedLength = table[u].pos - elt.pos; 447 table[u].length += addedLength; 448 table[u].pos = elt.pos; 449 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ 450 table[u].savings += elt.length / 8; /* rough approx bonus */ 451 elt = table[u]; 452 /* sort : improve rank */ 453 while ((u>1) && (table[u-1].savings < elt.savings)) 454 table[u] = table[u-1], u--; 455 table[u] = elt; 456 return u; 457 } } 458 459 /* front overlap */ 460 for (u=1; u<tableSize; u++) { 461 if (u==eltNbToSkip) continue; 462 463 if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */ 464 /* append */ 465 int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length); 466 table[u].savings += elt.length / 8; /* rough approx bonus */ 467 if (addedLength > 0) { /* otherwise, elt fully included into existing */ 468 table[u].length += addedLength; 469 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ 470 } 471 /* sort : improve rank */ 472 elt = table[u]; 473 while ((u>1) && (table[u-1].savings < elt.savings)) 474 table[u] = table[u-1], u--; 475 table[u] = elt; 476 return u; 477 } 478 479 if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) { 480 if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) { 481 size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 ); 482 table[u].pos = elt.pos; 483 table[u].savings += (U32)(elt.savings * addedLength / elt.length); 484 table[u].length = MIN(elt.length, table[u].length + 1); 485 return u; 486 } 487 } 488 } 489 490 return 0; 491 } 492 493 494 static void ZDICT_removeDictItem(dictItem* table, U32 id) 495 { 496 /* convention : table[0].pos stores nb of elts */ 497 U32 const max = table[0].pos; 498 U32 u; 499 if (!id) return; /* protection, should never happen */ 500 for (u=id; u<max-1; u++) 501 table[u] = table[u+1]; 502 table->pos--; 503 } 504 505 506 static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer) 507 { 508 /* merge if possible */ 509 U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer); 510 if (mergeId) { 511 U32 newMerge = 1; 512 while (newMerge) { 513 newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer); 514 if (newMerge) ZDICT_removeDictItem(table, mergeId); 515 mergeId = newMerge; 516 } 517 return; 518 } 519 520 /* insert */ 521 { U32 current; 522 U32 nextElt = table->pos; 523 if (nextElt >= maxSize) nextElt = maxSize-1; 524 current = nextElt-1; 525 while (table[current].savings < elt.savings) { 526 table[current+1] = table[current]; 527 current--; 528 } 529 table[current+1] = elt; 530 table->pos = nextElt+1; 531 } 532 } 533 534 535 static U32 ZDICT_dictSize(const dictItem* dictList) 536 { 537 U32 u, dictSize = 0; 538 for (u=1; u<dictList[0].pos; u++) 539 dictSize += dictList[u].length; 540 return dictSize; 541 } 542 543 544 static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize, 545 const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */ 546 const size_t* fileSizes, unsigned nbFiles, 547 unsigned minRatio, U32 notificationLevel) 548 { 549 int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0)); 550 int* const suffix = suffix0+1; 551 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix)); 552 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */ 553 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos)); 554 size_t result = 0; 555 clock_t displayClock = 0; 556 clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10; 557 558 # undef DISPLAYUPDATE 559 # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \ 560 if (ZDICT_clockSpan(displayClock) > refreshRate) \ 561 { displayClock = clock(); DISPLAY(__VA_ARGS__); \ 562 if (notificationLevel>=4) fflush(stderr); } } 563 564 /* init */ 565 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 566 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) { 567 result = ERROR(memory_allocation); 568 goto _cleanup; 569 } 570 if (minRatio < MINRATIO) minRatio = MINRATIO; 571 memset(doneMarks, 0, bufferSize+16); 572 573 /* limit sample set size (divsufsort limitation)*/ 574 if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20)); 575 while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles]; 576 577 /* sort */ 578 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20)); 579 { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0); 580 if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; } 581 } 582 suffix[bufferSize] = (int)bufferSize; /* leads into noise */ 583 suffix0[0] = (int)bufferSize; /* leads into noise */ 584 /* build reverse suffix sort */ 585 { size_t pos; 586 for (pos=0; pos < bufferSize; pos++) 587 reverseSuffix[suffix[pos]] = (U32)pos; 588 /* note filePos tracks borders between samples. 589 It's not used at this stage, but planned to become useful in a later update */ 590 filePos[0] = 0; 591 for (pos=1; pos<nbFiles; pos++) 592 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]); 593 } 594 595 DISPLAYLEVEL(2, "finding patterns ... \n"); 596 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio); 597 598 { U32 cursor; for (cursor=0; cursor < bufferSize; ) { 599 dictItem solution; 600 if (doneMarks[cursor]) { cursor++; continue; } 601 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel); 602 if (solution.length==0) { cursor++; continue; } 603 ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); 604 cursor += solution.length; 605 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100); 606 } } 607 608 _cleanup: 609 free(suffix0); 610 free(reverseSuffix); 611 free(doneMarks); 612 free(filePos); 613 return result; 614 } 615 616 617 static void ZDICT_fillNoise(void* buffer, size_t length) 618 { 619 unsigned const prime1 = 2654435761U; 620 unsigned const prime2 = 2246822519U; 621 unsigned acc = prime1; 622 size_t p=0; 623 for (p=0; p<length; p++) { 624 acc *= prime2; 625 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21); 626 } 627 } 628 629 630 typedef struct 631 { 632 ZSTD_CDict* dict; /* dictionary */ 633 ZSTD_CCtx* zc; /* working context */ 634 void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ 635 } EStats_ress_t; 636 637 #define MAXREPOFFSET 1024 638 639 static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params, 640 unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets, 641 const void* src, size_t srcSize, 642 U32 notificationLevel) 643 { 644 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog); 645 size_t cSize; 646 647 if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ 648 { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict); 649 if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; } 650 651 } 652 cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); 653 if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; } 654 655 if (cSize) { /* if == 0; block is not compressible */ 656 const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc); 657 658 /* literals stats */ 659 { const BYTE* bytePtr; 660 for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) 661 countLit[*bytePtr]++; 662 } 663 664 /* seqStats */ 665 { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); 666 ZSTD_seqToCodes(seqStorePtr); 667 668 { const BYTE* codePtr = seqStorePtr->ofCode; 669 U32 u; 670 for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++; 671 } 672 673 { const BYTE* codePtr = seqStorePtr->mlCode; 674 U32 u; 675 for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++; 676 } 677 678 { const BYTE* codePtr = seqStorePtr->llCode; 679 U32 u; 680 for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++; 681 } 682 683 if (nbSeq >= 2) { /* rep offsets */ 684 const seqDef* const seq = seqStorePtr->sequencesStart; 685 U32 offset1 = seq[0].offBase - ZSTD_REP_NUM; 686 U32 offset2 = seq[1].offBase - ZSTD_REP_NUM; 687 if (offset1 >= MAXREPOFFSET) offset1 = 0; 688 if (offset2 >= MAXREPOFFSET) offset2 = 0; 689 repOffsets[offset1] += 3; 690 repOffsets[offset2] += 1; 691 } } } 692 } 693 694 static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles) 695 { 696 size_t total=0; 697 unsigned u; 698 for (u=0; u<nbFiles; u++) total += fileSizes[u]; 699 return total; 700 } 701 702 typedef struct { U32 offset; U32 count; } offsetCount_t; 703 704 static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count) 705 { 706 U32 u; 707 table[ZSTD_REP_NUM].offset = val; 708 table[ZSTD_REP_NUM].count = count; 709 for (u=ZSTD_REP_NUM; u>0; u--) { 710 offsetCount_t tmp; 711 if (table[u-1].count >= table[u].count) break; 712 tmp = table[u-1]; 713 table[u-1] = table[u]; 714 table[u] = tmp; 715 } 716 } 717 718 /* ZDICT_flatLit() : 719 * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals. 720 * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode. 721 */ 722 static void ZDICT_flatLit(unsigned* countLit) 723 { 724 int u; 725 for (u=1; u<256; u++) countLit[u] = 2; 726 countLit[0] = 4; 727 countLit[253] = 1; 728 countLit[254] = 1; 729 } 730 731 #define OFFCODE_MAX 30 /* only applicable to first block */ 732 static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, 733 int compressionLevel, 734 const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles, 735 const void* dictBuffer, size_t dictBufferSize, 736 unsigned notificationLevel) 737 { 738 unsigned countLit[256]; 739 HUF_CREATE_STATIC_CTABLE(hufTable, 255); 740 unsigned offcodeCount[OFFCODE_MAX+1]; 741 short offcodeNCount[OFFCODE_MAX+1]; 742 U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB)); 743 unsigned matchLengthCount[MaxML+1]; 744 short matchLengthNCount[MaxML+1]; 745 unsigned litLengthCount[MaxLL+1]; 746 short litLengthNCount[MaxLL+1]; 747 U32 repOffset[MAXREPOFFSET]; 748 offsetCount_t bestRepOffset[ZSTD_REP_NUM+1]; 749 EStats_ress_t esr = { NULL, NULL, NULL }; 750 ZSTD_parameters params; 751 U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; 752 size_t pos = 0, errorCode; 753 size_t eSize = 0; 754 size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); 755 size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles); 756 BYTE* dstPtr = (BYTE*)dstBuffer; 757 758 /* init */ 759 DEBUGLOG(4, "ZDICT_analyzeEntropy"); 760 if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */ 761 for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */ 762 for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1; 763 for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1; 764 for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1; 765 memset(repOffset, 0, sizeof(repOffset)); 766 repOffset[1] = repOffset[4] = repOffset[8] = 1; 767 memset(bestRepOffset, 0, sizeof(bestRepOffset)); 768 if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT; 769 params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); 770 771 esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem); 772 esr.zc = ZSTD_createCCtx(); 773 esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); 774 if (!esr.dict || !esr.zc || !esr.workPlace) { 775 eSize = ERROR(memory_allocation); 776 DISPLAYLEVEL(1, "Not enough memory \n"); 777 goto _cleanup; 778 } 779 780 /* collect stats on all samples */ 781 for (u=0; u<nbFiles; u++) { 782 ZDICT_countEStats(esr, ¶ms, 783 countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, 784 (const char*)srcBuffer + pos, fileSizes[u], 785 notificationLevel); 786 pos += fileSizes[u]; 787 } 788 789 if (notificationLevel >= 4) { 790 /* writeStats */ 791 DISPLAYLEVEL(4, "Offset Code Frequencies : \n"); 792 for (u=0; u<=offcodeMax; u++) { 793 DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]); 794 } } 795 796 /* analyze, build stats, starting with literals */ 797 { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); 798 if (HUF_isError(maxNbBits)) { 799 eSize = maxNbBits; 800 DISPLAYLEVEL(1, " HUF_buildCTable error \n"); 801 goto _cleanup; 802 } 803 if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */ 804 DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n"); 805 ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */ 806 maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); 807 assert(maxNbBits==9); 808 } 809 huffLog = (U32)maxNbBits; 810 } 811 812 /* looking for most common first offsets */ 813 { U32 offset; 814 for (offset=1; offset<MAXREPOFFSET; offset++) 815 ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]); 816 } 817 /* note : the result of this phase should be used to better appreciate the impact on statistics */ 818 819 total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u]; 820 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1); 821 if (FSE_isError(errorCode)) { 822 eSize = errorCode; 823 DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n"); 824 goto _cleanup; 825 } 826 Offlog = (U32)errorCode; 827 828 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u]; 829 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1); 830 if (FSE_isError(errorCode)) { 831 eSize = errorCode; 832 DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n"); 833 goto _cleanup; 834 } 835 mlLog = (U32)errorCode; 836 837 total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u]; 838 errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1); 839 if (FSE_isError(errorCode)) { 840 eSize = errorCode; 841 DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n"); 842 goto _cleanup; 843 } 844 llLog = (U32)errorCode; 845 846 /* write result to buffer */ 847 { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog); 848 if (HUF_isError(hhSize)) { 849 eSize = hhSize; 850 DISPLAYLEVEL(1, "HUF_writeCTable error \n"); 851 goto _cleanup; 852 } 853 dstPtr += hhSize; 854 maxDstSize -= hhSize; 855 eSize += hhSize; 856 } 857 858 { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog); 859 if (FSE_isError(ohSize)) { 860 eSize = ohSize; 861 DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n"); 862 goto _cleanup; 863 } 864 dstPtr += ohSize; 865 maxDstSize -= ohSize; 866 eSize += ohSize; 867 } 868 869 { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog); 870 if (FSE_isError(mhSize)) { 871 eSize = mhSize; 872 DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n"); 873 goto _cleanup; 874 } 875 dstPtr += mhSize; 876 maxDstSize -= mhSize; 877 eSize += mhSize; 878 } 879 880 { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog); 881 if (FSE_isError(lhSize)) { 882 eSize = lhSize; 883 DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n"); 884 goto _cleanup; 885 } 886 dstPtr += lhSize; 887 maxDstSize -= lhSize; 888 eSize += lhSize; 889 } 890 891 if (maxDstSize<12) { 892 eSize = ERROR(dstSize_tooSmall); 893 DISPLAYLEVEL(1, "not enough space to write RepOffsets \n"); 894 goto _cleanup; 895 } 896 # if 0 897 MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset); 898 MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset); 899 MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset); 900 #else 901 /* at this stage, we don't use the result of "most common first offset", 902 * as the impact of statistics is not properly evaluated */ 903 MEM_writeLE32(dstPtr+0, repStartValue[0]); 904 MEM_writeLE32(dstPtr+4, repStartValue[1]); 905 MEM_writeLE32(dstPtr+8, repStartValue[2]); 906 #endif 907 eSize += 12; 908 909 _cleanup: 910 ZSTD_freeCDict(esr.dict); 911 ZSTD_freeCCtx(esr.zc); 912 free(esr.workPlace); 913 914 return eSize; 915 } 916 917 918 /** 919 * @returns the maximum repcode value 920 */ 921 static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM]) 922 { 923 U32 maxRep = reps[0]; 924 int r; 925 for (r = 1; r < ZSTD_REP_NUM; ++r) 926 maxRep = MAX(maxRep, reps[r]); 927 return maxRep; 928 } 929 930 size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, 931 const void* customDictContent, size_t dictContentSize, 932 const void* samplesBuffer, const size_t* samplesSizes, 933 unsigned nbSamples, ZDICT_params_t params) 934 { 935 size_t hSize; 936 #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */ 937 BYTE header[HBUFFSIZE]; 938 int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel; 939 U32 const notificationLevel = params.notificationLevel; 940 /* The final dictionary content must be at least as large as the largest repcode */ 941 size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue); 942 size_t paddingSize; 943 944 /* check conditions */ 945 DEBUGLOG(4, "ZDICT_finalizeDictionary"); 946 if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); 947 if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); 948 949 /* dictionary header */ 950 MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY); 951 { U64 const randomID = XXH64(customDictContent, dictContentSize, 0); 952 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; 953 U32 const dictID = params.dictID ? params.dictID : compliantID; 954 MEM_writeLE32(header+4, dictID); 955 } 956 hSize = 8; 957 958 /* entropy tables */ 959 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 960 DISPLAYLEVEL(2, "statistics ... \n"); 961 { size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize, 962 compressionLevel, 963 samplesBuffer, samplesSizes, nbSamples, 964 customDictContent, dictContentSize, 965 notificationLevel); 966 if (ZDICT_isError(eSize)) return eSize; 967 hSize += eSize; 968 } 969 970 /* Shrink the content size if it doesn't fit in the buffer */ 971 if (hSize + dictContentSize > dictBufferCapacity) { 972 dictContentSize = dictBufferCapacity - hSize; 973 } 974 975 /* Pad the dictionary content with zeros if it is too small */ 976 if (dictContentSize < minContentSize) { 977 RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall, 978 "dictBufferCapacity too small to fit max repcode"); 979 paddingSize = minContentSize - dictContentSize; 980 } else { 981 paddingSize = 0; 982 } 983 984 { 985 size_t const dictSize = hSize + paddingSize + dictContentSize; 986 987 /* The dictionary consists of the header, optional padding, and the content. 988 * The padding comes before the content because the "best" position in the 989 * dictionary is the last byte. 990 */ 991 BYTE* const outDictHeader = (BYTE*)dictBuffer; 992 BYTE* const outDictPadding = outDictHeader + hSize; 993 BYTE* const outDictContent = outDictPadding + paddingSize; 994 995 assert(dictSize <= dictBufferCapacity); 996 assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize); 997 998 /* First copy the customDictContent into its final location. 999 * `customDictContent` and `dictBuffer` may overlap, so we must 1000 * do this before any other writes into the output buffer. 1001 * Then copy the header & padding into the output buffer. 1002 */ 1003 memmove(outDictContent, customDictContent, dictContentSize); 1004 memcpy(outDictHeader, header, hSize); 1005 memset(outDictPadding, 0, paddingSize); 1006 1007 return dictSize; 1008 } 1009 } 1010 1011 1012 static size_t ZDICT_addEntropyTablesFromBuffer_advanced( 1013 void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, 1014 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 1015 ZDICT_params_t params) 1016 { 1017 int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel; 1018 U32 const notificationLevel = params.notificationLevel; 1019 size_t hSize = 8; 1020 1021 /* calculate entropy tables */ 1022 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 1023 DISPLAYLEVEL(2, "statistics ... \n"); 1024 { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize, 1025 compressionLevel, 1026 samplesBuffer, samplesSizes, nbSamples, 1027 (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 1028 notificationLevel); 1029 if (ZDICT_isError(eSize)) return eSize; 1030 hSize += eSize; 1031 } 1032 1033 /* add dictionary header (after entropy tables) */ 1034 MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY); 1035 { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0); 1036 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; 1037 U32 const dictID = params.dictID ? params.dictID : compliantID; 1038 MEM_writeLE32((char*)dictBuffer+4, dictID); 1039 } 1040 1041 if (hSize + dictContentSize < dictBufferCapacity) 1042 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); 1043 return MIN(dictBufferCapacity, hSize+dictContentSize); 1044 } 1045 1046 /*! ZDICT_trainFromBuffer_unsafe_legacy() : 1047 * Warning : `samplesBuffer` must be followed by noisy guard band !!! 1048 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError() 1049 */ 1050 /* Begin FreeBSD - This symbol is needed by dll-linked CLI zstd(1). */ 1051 ZSTDLIB_API 1052 /* End FreeBSD */ 1053 static size_t ZDICT_trainFromBuffer_unsafe_legacy( 1054 void* dictBuffer, size_t maxDictSize, 1055 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 1056 ZDICT_legacy_params_t params) 1057 { 1058 U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16)); 1059 dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); 1060 unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel; 1061 unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity; 1062 size_t const targetDictSize = maxDictSize; 1063 size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); 1064 size_t dictSize = 0; 1065 U32 const notificationLevel = params.zParams.notificationLevel; 1066 1067 /* checks */ 1068 if (!dictList) return ERROR(memory_allocation); 1069 if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */ 1070 if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */ 1071 1072 /* init */ 1073 ZDICT_initDictItem(dictList); 1074 1075 /* build dictionary */ 1076 ZDICT_trainBuffer_legacy(dictList, dictListSize, 1077 samplesBuffer, samplesBuffSize, 1078 samplesSizes, nbSamples, 1079 minRep, notificationLevel); 1080 1081 /* display best matches */ 1082 if (params.zParams.notificationLevel>= 3) { 1083 unsigned const nb = MIN(25, dictList[0].pos); 1084 unsigned const dictContentSize = ZDICT_dictSize(dictList); 1085 unsigned u; 1086 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize); 1087 DISPLAYLEVEL(3, "list %u best segments \n", nb-1); 1088 for (u=1; u<nb; u++) { 1089 unsigned const pos = dictList[u].pos; 1090 unsigned const length = dictList[u].length; 1091 U32 const printedLength = MIN(40, length); 1092 if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) { 1093 free(dictList); 1094 return ERROR(GENERIC); /* should never happen */ 1095 } 1096 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", 1097 u, length, pos, (unsigned)dictList[u].savings); 1098 ZDICT_printHex((const char*)samplesBuffer+pos, printedLength); 1099 DISPLAYLEVEL(3, "| \n"); 1100 } } 1101 1102 1103 /* create dictionary */ 1104 { unsigned dictContentSize = ZDICT_dictSize(dictList); 1105 if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */ 1106 if (dictContentSize < targetDictSize/4) { 1107 DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize); 1108 if (samplesBuffSize < 10 * targetDictSize) 1109 DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20)); 1110 if (minRep > MINRATIO) { 1111 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1); 1112 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n"); 1113 } 1114 } 1115 1116 if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) { 1117 unsigned proposedSelectivity = selectivity-1; 1118 while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; } 1119 DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize); 1120 DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity); 1121 DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n"); 1122 } 1123 1124 /* limit dictionary size */ 1125 { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */ 1126 U32 currentSize = 0; 1127 U32 n; for (n=1; n<max; n++) { 1128 currentSize += dictList[n].length; 1129 if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; } 1130 } 1131 dictList->pos = n; 1132 dictContentSize = currentSize; 1133 } 1134 1135 /* build dict content */ 1136 { U32 u; 1137 BYTE* ptr = (BYTE*)dictBuffer + maxDictSize; 1138 for (u=1; u<dictList->pos; u++) { 1139 U32 l = dictList[u].length; 1140 ptr -= l; 1141 if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */ 1142 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); 1143 } } 1144 1145 dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, 1146 samplesBuffer, samplesSizes, nbSamples, 1147 params.zParams); 1148 } 1149 1150 /* clean up */ 1151 free(dictList); 1152 return dictSize; 1153 } 1154 1155 1156 /* ZDICT_trainFromBuffer_legacy() : 1157 * issue : samplesBuffer need to be followed by a noisy guard band. 1158 * work around : duplicate the buffer, and add the noise */ 1159 size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity, 1160 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 1161 ZDICT_legacy_params_t params) 1162 { 1163 size_t result; 1164 void* newBuff; 1165 size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); 1166 if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */ 1167 1168 newBuff = malloc(sBuffSize + NOISELENGTH); 1169 if (!newBuff) return ERROR(memory_allocation); 1170 1171 memcpy(newBuff, samplesBuffer, sBuffSize); 1172 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ 1173 1174 result = 1175 ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff, 1176 samplesSizes, nbSamples, params); 1177 free(newBuff); 1178 return result; 1179 } 1180 1181 1182 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, 1183 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) 1184 { 1185 ZDICT_fastCover_params_t params; 1186 DEBUGLOG(3, "ZDICT_trainFromBuffer"); 1187 memset(¶ms, 0, sizeof(params)); 1188 params.d = 8; 1189 params.steps = 4; 1190 /* Use default level since no compression level information is available */ 1191 params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; 1192 #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1) 1193 params.zParams.notificationLevel = DEBUGLEVEL; 1194 #endif 1195 return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity, 1196 samplesBuffer, samplesSizes, nbSamples, 1197 ¶ms); 1198 } 1199 1200 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, 1201 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) 1202 { 1203 ZDICT_params_t params; 1204 memset(¶ms, 0, sizeof(params)); 1205 return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, 1206 samplesBuffer, samplesSizes, nbSamples, 1207 params); 1208 } 1209