/*
 * Copyright (c) Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */



/* **************************************
*  Compiler Warnings
****************************************/
#ifdef _MSC_VER
#  pragma warning(disable : 4127)   /* disable: C4127: conditional expression is constant */
#endif


/*-*************************************
*  Includes
***************************************/
#include "platform.h"           /* Large Files support */
#include "util.h"               /* UTIL_getFileSize, UTIL_getTotalFileSize */
#include <stdlib.h>             /* malloc, free */
#include <string.h>             /* memset */
#include <stdio.h>              /* fprintf, fopen, ftello64 */
#include <errno.h>              /* errno */
#include <assert.h>

#include "timefn.h"             /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
#include "../lib/common/mem.h"  /* read */
#include "dibio.h"


/*-*************************************
*  Constants
***************************************/
#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)

#define SAMPLESIZE_MAX (128 KB)
#define MEMMULT 11              /* rough estimation : memory cost to analyze 1 byte of sample */
#define COVER_MEMMULT 9         /* rough estimation : memory cost to analyze 1 byte of sample */
#define FASTCOVER_MEMMULT 1     /* rough estimation : memory cost to analyze 1 byte of sample */
static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));

#define NOISELENGTH 32
#define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */


/*-*************************************
*  Console display
***************************************/
#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }

static const U64 g_refreshRate = SEC_TO_MICRO / 6;
static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;

#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
              if (displayLevel>=4) fflush(stderr); } } }

/*-*************************************
*  Exceptions
***************************************/
#ifndef DEBUG
#  define DEBUG 0
#endif
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
#define EXM_THROW(error, ...)                                             \
{                                                                         \
    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
    DISPLAY("Error %i : ", error);                                        \
    DISPLAY(__VA_ARGS__);                                                 \
    DISPLAY("\n");                                                        \
    exit(error);                                                          \
}


/* ********************************************************
*  Helper functions
**********************************************************/
#undef MIN
#define MIN(a,b)    ((a) < (b) ? (a) : (b))

/**
  Returns the size of a file.
  If error returns -1.
*/
static S64 DiB_getFileSize (const char * fileName)
{
    U64 const fileSize = UTIL_getFileSize(fileName);
    return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
}
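
/* Note : file sizes are carried as signed 64-bit values (S64) throughout this
 * file, so that -1 can signal a stat error or unknown size while still
 * representing files larger than 4 GB on 32-bit builds. */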

/* ********************************************************
*  File related operations
**********************************************************/
/** DiB_loadFiles() :
 *  load samples from files listed in fileNamesTable into buffer.
 *  works even if buffer is too small to load all samples.
 *  Also provides the size of each sample into sampleSizes table
 *  which must be sized correctly, using DiB_fileStats().
 * @return : nb of samples effectively loaded into `buffer`
 *  *bufferSizePtr is modified, it provides the amount of data loaded within buffer.
 *  sampleSizes is filled with the size of each sample.
 */
static int DiB_loadFiles(
    void* buffer, size_t* bufferSizePtr,
    size_t* sampleSizes, int sstSize,
    const char** fileNamesTable, int nbFiles,
    size_t targetChunkSize, int displayLevel )
{
    char* const buff = (char*)buffer;
    size_t totalDataLoaded = 0;
    int nbSamplesLoaded = 0;
    int fileIndex = 0;
    FILE * f = NULL;

    assert(targetChunkSize <= SAMPLESIZE_MAX);

    while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
        size_t fileDataLoaded;
        S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
        if (fileSize <= 0) {
            /* skip if zero-size or file error ;
             * fileIndex must still advance, otherwise the loop never terminates */
            fileIndex += 1;
            continue;
        }

        f = fopen( fileNamesTable[fileIndex], "rb");
        if (f == NULL)
            EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
        DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);

        /* Load the first chunk of data from the file */
        fileDataLoaded = targetChunkSize > 0 ?
                            (size_t)MIN(fileSize, (S64)targetChunkSize) :
                            (size_t)MIN(fileSize, SAMPLESIZE_MAX );
        if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
            break;
        if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
            EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
        sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
        totalDataLoaded += fileDataLoaded;

        /* If file-chunking is enabled, load the rest of the file as more samples */
        if (targetChunkSize > 0) {
            while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
                size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
                if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
                    break;

                if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
                    EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
                sampleSizes[nbSamplesLoaded++] = chunkSize;
                totalDataLoaded += chunkSize;
                fileDataLoaded += chunkSize;
            }
        }
        fileIndex += 1;
        fclose(f); f = NULL;
    }
    if (f != NULL)
        fclose(f);

    DISPLAYLEVEL(2, "\r%79s\r", "");
    DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
        (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
    *bufferSizePtr = totalDataLoaded;
    return nbSamplesLoaded;
}

#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
static U32 DiB_rand(U32* src)
{
    static const U32 prime1 = 2654435761U;
    static const U32 prime2 = 2246822519U;
    U32 rand32 = *src;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32  = DiB_rotl32(rand32, 13);
    *src = rand32;
    return rand32 >> 5;
}
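
/* Note : DiB_rand() is a small deterministic generator (multiply / xor / rotate
 * with xxHash32-style prime constants). It only drives the shuffle below,
 * so reproducibility matters more than statistical quality. */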

/* DiB_shuffle() :
 * shuffle a table of file names in a semi-random way.
 * It improves dictionary quality by reducing the impact of "locality" :
 * when the sample set is very large, random elements from it get loaded,
 * instead of just the first ones. */
static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
    U32 seed = 0xFD2FB528;
    unsigned i;
    assert(nbFiles >= 1);
    for (i = nbFiles - 1; i > 0; --i) {
        unsigned const j = DiB_rand(&seed) % (i + 1);
        const char* const tmp = fileNamesTable[j];
        fileNamesTable[j] = fileNamesTable[i];
        fileNamesTable[i] = tmp;
    }
}


/*-********************************************************
*  Dictionary training functions
**********************************************************/
static size_t DiB_findMaxMem(unsigned long long requiredMem)
{
    size_t const step = 8 MB;
    void* testmem = NULL;

    requiredMem = (((requiredMem >> 23) + 1) << 23);
    requiredMem += step;
    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;

    while (!testmem) {
        testmem = malloc((size_t)requiredMem);
        requiredMem -= step;
    }

    free(testmem);
    return (size_t)requiredMem;
}


static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const prime1 = 2654435761U;
    unsigned const prime2 = 2246822519U;
    unsigned acc = prime1;
    size_t p=0;

    for (p=0; p<length; p++) {
        acc *= prime2;
        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
    }
}


static void DiB_saveDict(const char* dictFileName,
                         const void* buff, size_t buffSize)
{
    FILE* const f = fopen(dictFileName, "wb");
    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);

    { size_t const n = fwrite(buff, 1, buffSize, f);
      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }

    { size_t const n = (size_t)fclose(f);
      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
}

typedef struct {
    S64 totalSizeToLoad;
    int nbSamples;
    int oneSampleTooLarge;
} fileStats;

/*! DiB_fileStats() :
 *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
 *  provides the amount of data to be loaded and the resulting nb of samples.
 *  This is useful primarily for allocation purposes : the sample buffer, and the sample sizes table.
 */
static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
{
    fileStats fs;
    int n;
    memset(&fs, 0, sizeof(fs));

    /* We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX */
    assert( chunkSize <= SAMPLESIZE_MAX );

    for (n=0; n<nbFiles; n++) {
        S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
        /* TODO: is there a minimum sample size? What if the file is 1-byte? */
        if (fileSize == 0) {
            DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
            continue;
        }

        /* the case where we are breaking up files in sample chunks */
        if (chunkSize > 0) {
            /* TODO: is there a minimum sample size? Can we have a 1-byte sample? */
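            /* chunk count = ceil(fileSize / chunkSize) :
             * a trailing partial chunk still gets its own sample slot */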
            fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
            fs.totalSizeToLoad += fileSize;
        }
        else {
            /* the case where one file is one sample */
            if (fileSize > SAMPLESIZE_MAX) {
                /* flag excessively large sample files */
                fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);

                /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
                DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB\n",
                    fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
            }
            fs.nbSamples += 1;
            fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
        }
    }
    DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
    return fs;
}
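
/*! DiB_trainFromFiles() :
 *  Train a dictionary of at most `maxDictSize` bytes from the samples listed in
 *  `fileNamesTable`, optionally splitting each file into `chunkSize`-byte blocks
 *  (0 == one sample per file), then save the result into `dictFileName`.
 *  The first non-NULL of `params` (legacy), `coverParams`, `fastCoverParams`
 *  selects the training algorithm; `optimize` enables parameter optimization for
 *  the cover / fastCover trainers; `memLimit` (if non-zero) caps the amount of
 *  training data loaded.
 * @return : 0 on success, 1 if dictionary training failed.
 */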
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
    fileStats fs;
    size_t* sampleSizes;   /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
    int nbSamplesLoaded;   /* nb of samples effectively loaded in srcBuffer */
    size_t loadedSize;     /* total data loaded in srcBuffer for all samples */
    void* srcBuffer;       /* contiguous buffer with training data/samples */
    void* const dictBuffer = malloc(maxDictSize);
    int result = 0;

    int const displayLevel = params ? params->zParams.notificationLevel :
                             coverParams ? coverParams->zParams.notificationLevel :
                             fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);

    /* Figure out how much sample data to load with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

    {
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT :
                            FASTCOVER_MEMMULT;
        size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
        /* Limit the size of the training data to the free memory */
        /* Limit the size of the training data to 2GB */
        /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        if (memLimit != 0) {
            DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
                (unsigned)(memLimit / (1 MB)));
            loadedSize = (size_t)MIN(loadedSize, memLimit);
        }
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
    }

    /* Checks */
    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
        EXM_THROW(12, "not enough memory for DiB_trainFromFiles");   /* should not happen */
    if (fs.oneSampleTooLarge) {
        DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n");
        DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n");
        DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
    }
    if (fs.nbSamples < 5) {
        DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n");
        DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n");
        DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n");
    }

    /* init */
    if ((S64)loadedSize < fs.totalSizeToLoad)
        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
            (unsigned)(fs.totalSizeToLoad / (1 MB)),
            (unsigned)(loadedSize / (1 MB)));

    /* Load input buffer */
    nbSamplesLoaded = DiB_loadFiles(
        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
        nbFiles, chunkSize, displayLevel);

    {   size_t dictSize;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
                dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
                                                               srcBuffer, sampleSizes, nbSamplesLoaded,
                                                               coverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
                                coverParams->steps, splitPercentage);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
                                                       sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else {
            assert(fastCoverParams != NULL);
            if (optimize) {
                dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
                                                                   srcBuffer, sampleSizes, nbSamplesLoaded,
                                                                   fastCoverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
                                fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
                                fastCoverParams->accel);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
                                                           sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
            result = 1;
            goto _cleanup;
        }
        /* save dict */
        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
        DiB_saveDict(dictFileName, dictBuffer, dictSize);
    }

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(sampleSizes);
    free(dictBuffer);
    return result;
}
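
/* Illustrative usage sketch (comment only, not part of the build) :
 * a caller such as the zstd CLI would typically drive this module as below.
 * The names fileTable, fileCount and fcParams are hypothetical; 112640 bytes
 * (110 KB) is the usual CLI default for the maximum dictionary size.
 *
 *     ZDICT_fastCover_params_t fcParams;
 *     memset(&fcParams, 0, sizeof(fcParams));
 *     fcParams.zParams.notificationLevel = 2;
 *     (void)DiB_trainFromFiles("dictionary", 112640,
 *                              fileTable, fileCount,
 *                              0,            // chunkSize : 0 = one file per sample
 *                              NULL, NULL,   // no legacy / cover params
 *                              &fcParams,    // use the fastCover trainer
 *                              1,            // optimize k and d
 *                              0);           // no manual memory limit
 */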