/* xref: /freebsd/sys/contrib/zstd/lib/dictBuilder/cover.h (revision f6a3b357e9be4c6423c85eff9a847163a0d307c8) */
#include <stdio.h>  /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h>   /* clock */
#include "mem.h" /* read */
#include "pool.h"
#include "threading.h"
#include "zstd_internal.h" /* includes zstd.h */
#ifndef ZDICT_STATIC_LINKING_ONLY
#define ZDICT_STATIC_LINKING_ONLY
#endif
#include "zdict.h"

14 /**
15  * COVER_best_t is used for two purposes:
16  * 1. Synchronizing threads.
17  * 2. Saving the best parameters and dictionary.
18  *
19  * All of the methods except COVER_best_init() are thread safe if zstd is
20  * compiled with multithreaded support.
21  */
22 typedef struct COVER_best_s {
23   ZSTD_pthread_mutex_t mutex;
24   ZSTD_pthread_cond_t cond;
25   size_t liveJobs;
26   void *dict;
27   size_t dictSize;
28   ZDICT_cover_params_t parameters;
29   size_t compressedSize;
30 } COVER_best_t;
31 
32 /**
33  * A segment is a range in the source as well as the score of the segment.
34  */
35 typedef struct {
36   U32 begin;
37   U32 end;
38   U32 score;
39 } COVER_segment_t;
40 
41 /**
42  *Number of epochs and size of each epoch.
43  */
44 typedef struct {
45   U32 num;
46   U32 size;
47 } COVER_epoch_info_t;
48 
49 /**
50  * Struct used for the dictionary selection function.
51  */
52 typedef struct COVER_dictSelection {
53   BYTE* dictContent;
54   size_t dictSize;
55   size_t totalCompressedSize;
56 } COVER_dictSelection_t;
57 
58 /**
59  * Computes the number of epochs and the size of each epoch.
60  * We will make sure that each epoch gets at least 10 * k bytes.
61  *
62  * The COVER algorithms divide the data up into epochs of equal size and
63  * select one segment from each epoch.
64  *
65  * @param maxDictSize The maximum allowed dictionary size.
66  * @param nbDmers     The number of dmers we are training on.
67  * @param k           The parameter k (segment size).
68  * @param passes      The target number of passes over the dmer corpus.
69  *                    More passes means a better dictionary.
70  */
71 COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
72                                        U32 k, U32 passes);
73 
/**
 * Warns the user when their corpus is too small.
 *
 * @param maxDictSize  The maximum allowed dictionary size.
 * @param nbDmers      The number of dmers we are training on.
 * @param displayLevel Verbosity level. NOTE(review): presumably gates whether
 *                     the warning is printed; confirm in the implementation.
 */
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers,
                             int displayLevel);

79 /**
80  *  Checks total compressed size of a dictionary
81  */
82 size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
83                                       const size_t *samplesSizes, const BYTE *samples,
84                                       size_t *offsets,
85                                       size_t nbTrainSamples, size_t nbSamples,
86                                       BYTE *const dict, size_t dictBufferCapacity);
87 
/**
 * Returns the sum of the sample sizes.
 *
 * @param samplesSizes Array of sample sizes.
 * @param nbSamples    Number of entries in samplesSizes.
 */
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples);

93 /**
94  * Initialize the `COVER_best_t`.
95  */
96 void COVER_best_init(COVER_best_t *best);
97 
98 /**
99  * Wait until liveJobs == 0.
100  */
101 void COVER_best_wait(COVER_best_t *best);
102 
103 /**
104  * Call COVER_best_wait() and then destroy the COVER_best_t.
105  */
106 void COVER_best_destroy(COVER_best_t *best);
107 
108 /**
109  * Called when a thread is about to be launched.
110  * Increments liveJobs.
111  */
112 void COVER_best_start(COVER_best_t *best);
113 
114 /**
115  * Called when a thread finishes executing, both on error or success.
116  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
117  * If this dictionary is the best so far save it and its parameters.
118  */
119 void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
120                        COVER_dictSelection_t selection);
121 /**
122  * Error function for COVER_selectDict function. Checks if the return
123  * value is an error.
124  */
125 unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
126 
127  /**
128   * Error function for COVER_selectDict function. Returns a struct where
129   * return.totalCompressedSize is a ZSTD error.
130   */
131 COVER_dictSelection_t COVER_dictSelectionError(size_t error);
132 
133 /**
134  * Always call after selectDict is called to free up used memory from
135  * newly created dictionary.
136  */
137 void COVER_dictSelectionFree(COVER_dictSelection_t selection);
138 
139 /**
140  * Called to finalize the dictionary and select one based on whether or not
141  * the shrink-dict flag was enabled. If enabled the dictionary used is the
142  * smallest dictionary within a specified regression of the compressed size
143  * from the largest dictionary.
144  */
145  COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
146                        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
147                        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
148