1 /* 2 * Copyright (c) 2017-2020, Facebook, Inc. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 #include <stdio.h> /* fprintf */ 12 #include <stdlib.h> /* malloc, free, qsort */ 13 #include <string.h> /* memset */ 14 #include <time.h> /* clock */ 15 #include "../common/mem.h" /* read */ 16 #include "../common/pool.h" 17 #include "../common/threading.h" 18 #include "../common/zstd_internal.h" /* includes zstd.h */ 19 #ifndef ZDICT_STATIC_LINKING_ONLY 20 #define ZDICT_STATIC_LINKING_ONLY 21 #endif 22 #include "zdict.h" 23 24 /** 25 * COVER_best_t is used for two purposes: 26 * 1. Synchronizing threads. 27 * 2. Saving the best parameters and dictionary. 28 * 29 * All of the methods except COVER_best_init() are thread safe if zstd is 30 * compiled with multithreaded support. 31 */ 32 typedef struct COVER_best_s { 33 ZSTD_pthread_mutex_t mutex; 34 ZSTD_pthread_cond_t cond; 35 size_t liveJobs; 36 void *dict; 37 size_t dictSize; 38 ZDICT_cover_params_t parameters; 39 size_t compressedSize; 40 } COVER_best_t; 41 42 /** 43 * A segment is a range in the source as well as the score of the segment. 44 */ 45 typedef struct { 46 U32 begin; 47 U32 end; 48 U32 score; 49 } COVER_segment_t; 50 51 /** 52 *Number of epochs and size of each epoch. 53 */ 54 typedef struct { 55 U32 num; 56 U32 size; 57 } COVER_epoch_info_t; 58 59 /** 60 * Struct used for the dictionary selection function. 61 */ 62 typedef struct COVER_dictSelection { 63 BYTE* dictContent; 64 size_t dictSize; 65 size_t totalCompressedSize; 66 } COVER_dictSelection_t; 67 68 /** 69 * Computes the number of epochs and the size of each epoch. 70 * We will make sure that each epoch gets at least 10 * k bytes. 71 * 72 * The COVER algorithms divide the data up into epochs of equal size and 73 * select one segment from each epoch. 74 * 75 * @param maxDictSize The maximum allowed dictionary size. 76 * @param nbDmers The number of dmers we are training on. 77 * @param k The parameter k (segment size). 78 * @param passes The target number of passes over the dmer corpus. 79 * More passes means a better dictionary. 80 */ 81 COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 82 U32 k, U32 passes); 83 84 /** 85 * Warns the user when their corpus is too small. 86 */ 87 void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 88 89 /** 90 * Checks total compressed size of a dictionary 91 */ 92 size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 93 const size_t *samplesSizes, const BYTE *samples, 94 size_t *offsets, 95 size_t nbTrainSamples, size_t nbSamples, 96 BYTE *const dict, size_t dictBufferCapacity); 97 98 /** 99 * Returns the sum of the sample sizes. 100 */ 101 size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 102 103 /** 104 * Initialize the `COVER_best_t`. 105 */ 106 void COVER_best_init(COVER_best_t *best); 107 108 /** 109 * Wait until liveJobs == 0. 110 */ 111 void COVER_best_wait(COVER_best_t *best); 112 113 /** 114 * Call COVER_best_wait() and then destroy the COVER_best_t. 115 */ 116 void COVER_best_destroy(COVER_best_t *best); 117 118 /** 119 * Called when a thread is about to be launched. 120 * Increments liveJobs. 121 */ 122 void COVER_best_start(COVER_best_t *best); 123 124 /** 125 * Called when a thread finishes executing, both on error or success. 126 * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 127 * If this dictionary is the best so far save it and its parameters. 128 */ 129 void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, 130 COVER_dictSelection_t selection); 131 /** 132 * Error function for COVER_selectDict function. Checks if the return 133 * value is an error. 134 */ 135 unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); 136 137 /** 138 * Error function for COVER_selectDict function. Returns a struct where 139 * return.totalCompressedSize is a ZSTD error. 140 */ 141 COVER_dictSelection_t COVER_dictSelectionError(size_t error); 142 143 /** 144 * Always call after selectDict is called to free up used memory from 145 * newly created dictionary. 146 */ 147 void COVER_dictSelectionFree(COVER_dictSelection_t selection); 148 149 /** 150 * Called to finalize the dictionary and select one based on whether or not 151 * the shrink-dict flag was enabled. If enabled the dictionary used is the 152 * smallest dictionary within a specified regression of the compressed size 153 * from the largest dictionary. 154 */ 155 COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, 156 size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, 157 size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); 158