xref: /freebsd/sys/contrib/zstd/lib/dictBuilder/cover.h (revision 2b8331622f0b212cf3bb4fc4914a501e5321d506)
1 /*
2  * Copyright (c) Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under both the BSD-style license (found in the
6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7  * in the COPYING file in the root directory of this source tree).
8  * You may select, at your option, one of the above-listed licenses.
9  */
10 
11 #ifndef ZDICT_STATIC_LINKING_ONLY
12 #  define ZDICT_STATIC_LINKING_ONLY
13 #endif
14 
15 #include <stdio.h>  /* fprintf */
16 #include <stdlib.h> /* malloc, free, qsort */
17 #include <string.h> /* memset */
18 #include <time.h>   /* clock */
19 #include "../common/mem.h" /* read */
20 #include "../common/pool.h"
21 #include "../common/threading.h"
22 #include "../common/zstd_internal.h" /* includes zstd.h */
23 #include "../zdict.h"
24 
25 /**
26  * COVER_best_t is used for two purposes:
27  * 1. Synchronizing threads.
28  * 2. Saving the best parameters and dictionary.
29  *
30  * All of the methods except COVER_best_init() are thread safe if zstd is
31  * compiled with multithreaded support.
32  */
33 typedef struct COVER_best_s {
34   ZSTD_pthread_mutex_t mutex;
35   ZSTD_pthread_cond_t cond;
36   size_t liveJobs;
37   void *dict;
38   size_t dictSize;
39   ZDICT_cover_params_t parameters;
40   size_t compressedSize;
41 } COVER_best_t;
42 
43 /**
44  * A segment is a range in the source as well as the score of the segment.
45  */
46 typedef struct {
47   U32 begin;
48   U32 end;
49   U32 score;
50 } COVER_segment_t;
51 
52 /**
53  *Number of epochs and size of each epoch.
54  */
55 typedef struct {
56   U32 num;
57   U32 size;
58 } COVER_epoch_info_t;
59 
60 /**
61  * Struct used for the dictionary selection function.
62  */
63 typedef struct COVER_dictSelection {
64   BYTE* dictContent;
65   size_t dictSize;
66   size_t totalCompressedSize;
67 } COVER_dictSelection_t;
68 
69 /**
70  * Computes the number of epochs and the size of each epoch.
71  * We will make sure that each epoch gets at least 10 * k bytes.
72  *
73  * The COVER algorithms divide the data up into epochs of equal size and
74  * select one segment from each epoch.
75  *
76  * @param maxDictSize The maximum allowed dictionary size.
77  * @param nbDmers     The number of dmers we are training on.
78  * @param k           The parameter k (segment size).
79  * @param passes      The target number of passes over the dmer corpus.
80  *                    More passes means a better dictionary.
81  */
82 COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
83                                        U32 k, U32 passes);
84 
85 /**
86  * Warns the user when their corpus is too small.
87  */
88 void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
89 
90 /**
91  *  Checks total compressed size of a dictionary
92  */
93 size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
94                                       const size_t *samplesSizes, const BYTE *samples,
95                                       size_t *offsets,
96                                       size_t nbTrainSamples, size_t nbSamples,
97                                       BYTE *const dict, size_t dictBufferCapacity);
98 
99 /**
100  * Returns the sum of the sample sizes.
101  */
102 size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
103 
104 /**
105  * Initialize the `COVER_best_t`.
106  */
107 void COVER_best_init(COVER_best_t *best);
108 
109 /**
110  * Wait until liveJobs == 0.
111  */
112 void COVER_best_wait(COVER_best_t *best);
113 
114 /**
115  * Call COVER_best_wait() and then destroy the COVER_best_t.
116  */
117 void COVER_best_destroy(COVER_best_t *best);
118 
119 /**
120  * Called when a thread is about to be launched.
121  * Increments liveJobs.
122  */
123 void COVER_best_start(COVER_best_t *best);
124 
125 /**
126  * Called when a thread finishes executing, both on error or success.
127  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
128  * If this dictionary is the best so far save it and its parameters.
129  */
130 void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
131                        COVER_dictSelection_t selection);
132 /**
133  * Error function for COVER_selectDict function. Checks if the return
134  * value is an error.
135  */
136 unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
137 
138  /**
139   * Error function for COVER_selectDict function. Returns a struct where
140   * return.totalCompressedSize is a ZSTD error.
141   */
142 COVER_dictSelection_t COVER_dictSelectionError(size_t error);
143 
144 /**
145  * Always call after selectDict is called to free up used memory from
146  * newly created dictionary.
147  */
148 void COVER_dictSelectionFree(COVER_dictSelection_t selection);
149 
150 /**
151  * Called to finalize the dictionary and select one based on whether or not
152  * the shrink-dict flag was enabled. If enabled the dictionary used is the
153  * smallest dictionary within a specified regression of the compressed size
154  * from the largest dictionary.
155  */
156  COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
157                        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
158                        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
159