1 #include <stdio.h> /* fprintf */ 2 #include <stdlib.h> /* malloc, free, qsort */ 3 #include <string.h> /* memset */ 4 #include <time.h> /* clock */ 5 #include "mem.h" /* read */ 6 #include "pool.h" 7 #include "threading.h" 8 #include "zstd_internal.h" /* includes zstd.h */ 9 #ifndef ZDICT_STATIC_LINKING_ONLY 10 #define ZDICT_STATIC_LINKING_ONLY 11 #endif 12 #include "zdict.h" 13 14 /** 15 * COVER_best_t is used for two purposes: 16 * 1. Synchronizing threads. 17 * 2. Saving the best parameters and dictionary. 18 * 19 * All of the methods except COVER_best_init() are thread safe if zstd is 20 * compiled with multithreaded support. 21 */ 22 typedef struct COVER_best_s { 23 ZSTD_pthread_mutex_t mutex; 24 ZSTD_pthread_cond_t cond; 25 size_t liveJobs; 26 void *dict; 27 size_t dictSize; 28 ZDICT_cover_params_t parameters; 29 size_t compressedSize; 30 } COVER_best_t; 31 32 /** 33 * A segment is a range in the source as well as the score of the segment. 34 */ 35 typedef struct { 36 U32 begin; 37 U32 end; 38 U32 score; 39 } COVER_segment_t; 40 41 /** 42 *Number of epochs and size of each epoch. 43 */ 44 typedef struct { 45 U32 num; 46 U32 size; 47 } COVER_epoch_info_t; 48 49 /** 50 * Struct used for the dictionary selection function. 51 */ 52 typedef struct COVER_dictSelection { 53 BYTE* dictContent; 54 size_t dictSize; 55 size_t totalCompressedSize; 56 } COVER_dictSelection_t; 57 58 /** 59 * Computes the number of epochs and the size of each epoch. 60 * We will make sure that each epoch gets at least 10 * k bytes. 61 * 62 * The COVER algorithms divide the data up into epochs of equal size and 63 * select one segment from each epoch. 64 * 65 * @param maxDictSize The maximum allowed dictionary size. 66 * @param nbDmers The number of dmers we are training on. 67 * @param k The parameter k (segment size). 68 * @param passes The target number of passes over the dmer corpus. 69 * More passes means a better dictionary. 70 */ 71 COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 72 U32 k, U32 passes); 73 74 /** 75 * Warns the user when their corpus is too small. 76 */ 77 void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 78 79 /** 80 * Checks total compressed size of a dictionary 81 */ 82 size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 83 const size_t *samplesSizes, const BYTE *samples, 84 size_t *offsets, 85 size_t nbTrainSamples, size_t nbSamples, 86 BYTE *const dict, size_t dictBufferCapacity); 87 88 /** 89 * Returns the sum of the sample sizes. 90 */ 91 size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 92 93 /** 94 * Initialize the `COVER_best_t`. 95 */ 96 void COVER_best_init(COVER_best_t *best); 97 98 /** 99 * Wait until liveJobs == 0. 100 */ 101 void COVER_best_wait(COVER_best_t *best); 102 103 /** 104 * Call COVER_best_wait() and then destroy the COVER_best_t. 105 */ 106 void COVER_best_destroy(COVER_best_t *best); 107 108 /** 109 * Called when a thread is about to be launched. 110 * Increments liveJobs. 111 */ 112 void COVER_best_start(COVER_best_t *best); 113 114 /** 115 * Called when a thread finishes executing, both on error or success. 116 * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 117 * If this dictionary is the best so far save it and its parameters. 118 */ 119 void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, 120 COVER_dictSelection_t selection); 121 /** 122 * Error function for COVER_selectDict function. Checks if the return 123 * value is an error. 124 */ 125 unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); 126 127 /** 128 * Error function for COVER_selectDict function. Returns a struct where 129 * return.totalCompressedSize is a ZSTD error. 130 */ 131 COVER_dictSelection_t COVER_dictSelectionError(size_t error); 132 133 /** 134 * Always call after selectDict is called to free up used memory from 135 * newly created dictionary. 136 */ 137 void COVER_dictSelectionFree(COVER_dictSelection_t selection); 138 139 /** 140 * Called to finalize the dictionary and select one based on whether or not 141 * the shrink-dict flag was enabled. If enabled the dictionary used is the 142 * smallest dictionary within a specified regression of the compressed size 143 * from the largest dictionary. 144 */ 145 COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, 146 size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, 147 size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); 148