1 /* 2 * Copyright (c) Facebook, Inc. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 #ifndef ZDICT_STATIC_LINKING_ONLY 12 # define ZDICT_STATIC_LINKING_ONLY 13 #endif 14 15 #include <stdio.h> /* fprintf */ 16 #include <stdlib.h> /* malloc, free, qsort */ 17 #include <string.h> /* memset */ 18 #include <time.h> /* clock */ 19 #include "../common/mem.h" /* read */ 20 #include "../common/pool.h" 21 #include "../common/threading.h" 22 #include "../common/zstd_internal.h" /* includes zstd.h */ 23 #include "../zdict.h" 24 25 /** 26 * COVER_best_t is used for two purposes: 27 * 1. Synchronizing threads. 28 * 2. Saving the best parameters and dictionary. 29 * 30 * All of the methods except COVER_best_init() are thread safe if zstd is 31 * compiled with multithreaded support. 32 */ 33 typedef struct COVER_best_s { 34 ZSTD_pthread_mutex_t mutex; 35 ZSTD_pthread_cond_t cond; 36 size_t liveJobs; 37 void *dict; 38 size_t dictSize; 39 ZDICT_cover_params_t parameters; 40 size_t compressedSize; 41 } COVER_best_t; 42 43 /** 44 * A segment is a range in the source as well as the score of the segment. 45 */ 46 typedef struct { 47 U32 begin; 48 U32 end; 49 U32 score; 50 } COVER_segment_t; 51 52 /** 53 *Number of epochs and size of each epoch. 54 */ 55 typedef struct { 56 U32 num; 57 U32 size; 58 } COVER_epoch_info_t; 59 60 /** 61 * Struct used for the dictionary selection function. 62 */ 63 typedef struct COVER_dictSelection { 64 BYTE* dictContent; 65 size_t dictSize; 66 size_t totalCompressedSize; 67 } COVER_dictSelection_t; 68 69 /** 70 * Computes the number of epochs and the size of each epoch. 71 * We will make sure that each epoch gets at least 10 * k bytes. 72 * 73 * The COVER algorithms divide the data up into epochs of equal size and 74 * select one segment from each epoch. 75 * 76 * @param maxDictSize The maximum allowed dictionary size. 77 * @param nbDmers The number of dmers we are training on. 78 * @param k The parameter k (segment size). 79 * @param passes The target number of passes over the dmer corpus. 80 * More passes means a better dictionary. 81 */ 82 COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 83 U32 k, U32 passes); 84 85 /** 86 * Warns the user when their corpus is too small. 87 */ 88 void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 89 90 /** 91 * Checks total compressed size of a dictionary 92 */ 93 size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 94 const size_t *samplesSizes, const BYTE *samples, 95 size_t *offsets, 96 size_t nbTrainSamples, size_t nbSamples, 97 BYTE *const dict, size_t dictBufferCapacity); 98 99 /** 100 * Returns the sum of the sample sizes. 101 */ 102 size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 103 104 /** 105 * Initialize the `COVER_best_t`. 106 */ 107 void COVER_best_init(COVER_best_t *best); 108 109 /** 110 * Wait until liveJobs == 0. 111 */ 112 void COVER_best_wait(COVER_best_t *best); 113 114 /** 115 * Call COVER_best_wait() and then destroy the COVER_best_t. 116 */ 117 void COVER_best_destroy(COVER_best_t *best); 118 119 /** 120 * Called when a thread is about to be launched. 121 * Increments liveJobs. 122 */ 123 void COVER_best_start(COVER_best_t *best); 124 125 /** 126 * Called when a thread finishes executing, both on error or success. 127 * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 128 * If this dictionary is the best so far save it and its parameters. 129 */ 130 void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, 131 COVER_dictSelection_t selection); 132 /** 133 * Error function for COVER_selectDict function. Checks if the return 134 * value is an error. 135 */ 136 unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); 137 138 /** 139 * Error function for COVER_selectDict function. Returns a struct where 140 * return.totalCompressedSize is a ZSTD error. 141 */ 142 COVER_dictSelection_t COVER_dictSelectionError(size_t error); 143 144 /** 145 * Always call after selectDict is called to free up used memory from 146 * newly created dictionary. 147 */ 148 void COVER_dictSelectionFree(COVER_dictSelection_t selection); 149 150 /** 151 * Called to finalize the dictionary and select one based on whether or not 152 * the shrink-dict flag was enabled. If enabled the dictionary used is the 153 * smallest dictionary within a specified regression of the compressed size 154 * from the largest dictionary. 155 */ 156 COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, 157 size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, 158 size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); 159