xref: /freebsd/sys/contrib/zstd/lib/dictBuilder/cover.h (revision 2b9c00cb6bd9392645dc8afca59cf57c42df4e2d)
10f743729SConrad Meyer #include <stdio.h>  /* fprintf */
20f743729SConrad Meyer #include <stdlib.h> /* malloc, free, qsort */
30f743729SConrad Meyer #include <string.h> /* memset */
40f743729SConrad Meyer #include <time.h>   /* clock */
50f743729SConrad Meyer #include "mem.h" /* read */
60f743729SConrad Meyer #include "pool.h"
70f743729SConrad Meyer #include "threading.h"
80f743729SConrad Meyer #include "zstd_internal.h" /* includes zstd.h */
90f743729SConrad Meyer #ifndef ZDICT_STATIC_LINKING_ONLY
100f743729SConrad Meyer #define ZDICT_STATIC_LINKING_ONLY
110f743729SConrad Meyer #endif
120f743729SConrad Meyer #include "zdict.h"
130f743729SConrad Meyer 
140f743729SConrad Meyer /**
150f743729SConrad Meyer  * COVER_best_t is used for two purposes:
160f743729SConrad Meyer  * 1. Synchronizing threads.
170f743729SConrad Meyer  * 2. Saving the best parameters and dictionary.
180f743729SConrad Meyer  *
190f743729SConrad Meyer  * All of the methods except COVER_best_init() are thread safe if zstd is
200f743729SConrad Meyer  * compiled with multithreaded support.
210f743729SConrad Meyer  */
220f743729SConrad Meyer typedef struct COVER_best_s {
230f743729SConrad Meyer   ZSTD_pthread_mutex_t mutex;
240f743729SConrad Meyer   ZSTD_pthread_cond_t cond;
250f743729SConrad Meyer   size_t liveJobs;
260f743729SConrad Meyer   void *dict;
270f743729SConrad Meyer   size_t dictSize;
280f743729SConrad Meyer   ZDICT_cover_params_t parameters;
290f743729SConrad Meyer   size_t compressedSize;
300f743729SConrad Meyer } COVER_best_t;
310f743729SConrad Meyer 
320f743729SConrad Meyer /**
330f743729SConrad Meyer  * A segment is a range in the source as well as the score of the segment.
340f743729SConrad Meyer  */
350f743729SConrad Meyer typedef struct {
360f743729SConrad Meyer   U32 begin;
370f743729SConrad Meyer   U32 end;
380f743729SConrad Meyer   U32 score;
390f743729SConrad Meyer } COVER_segment_t;
400f743729SConrad Meyer 
410f743729SConrad Meyer /**
42*2b9c00cbSConrad Meyer  *Number of epochs and size of each epoch.
43*2b9c00cbSConrad Meyer  */
44*2b9c00cbSConrad Meyer typedef struct {
45*2b9c00cbSConrad Meyer   U32 num;
46*2b9c00cbSConrad Meyer   U32 size;
47*2b9c00cbSConrad Meyer } COVER_epoch_info_t;
48*2b9c00cbSConrad Meyer 
49*2b9c00cbSConrad Meyer /**
50*2b9c00cbSConrad Meyer  * Computes the number of epochs and the size of each epoch.
51*2b9c00cbSConrad Meyer  * We will make sure that each epoch gets at least 10 * k bytes.
52*2b9c00cbSConrad Meyer  *
53*2b9c00cbSConrad Meyer  * The COVER algorithms divide the data up into epochs of equal size and
54*2b9c00cbSConrad Meyer  * select one segment from each epoch.
55*2b9c00cbSConrad Meyer  *
56*2b9c00cbSConrad Meyer  * @param maxDictSize The maximum allowed dictionary size.
57*2b9c00cbSConrad Meyer  * @param nbDmers     The number of dmers we are training on.
58*2b9c00cbSConrad Meyer  * @param k           The parameter k (segment size).
59*2b9c00cbSConrad Meyer  * @param passes      The target number of passes over the dmer corpus.
60*2b9c00cbSConrad Meyer  *                    More passes means a better dictionary.
61*2b9c00cbSConrad Meyer  */
62*2b9c00cbSConrad Meyer COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
63*2b9c00cbSConrad Meyer                                        U32 k, U32 passes);
64*2b9c00cbSConrad Meyer 
65*2b9c00cbSConrad Meyer /**
66*2b9c00cbSConrad Meyer  * Warns the user when their corpus is too small.
67*2b9c00cbSConrad Meyer  */
68*2b9c00cbSConrad Meyer void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
69*2b9c00cbSConrad Meyer 
70*2b9c00cbSConrad Meyer /**
710f743729SConrad Meyer  *  Checks total compressed size of a dictionary
720f743729SConrad Meyer  */
730f743729SConrad Meyer size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
740f743729SConrad Meyer                                       const size_t *samplesSizes, const BYTE *samples,
750f743729SConrad Meyer                                       size_t *offsets,
760f743729SConrad Meyer                                       size_t nbTrainSamples, size_t nbSamples,
770f743729SConrad Meyer                                       BYTE *const dict, size_t dictBufferCapacity);
780f743729SConrad Meyer 
790f743729SConrad Meyer /**
800f743729SConrad Meyer  * Returns the sum of the sample sizes.
810f743729SConrad Meyer  */
820f743729SConrad Meyer size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
830f743729SConrad Meyer 
840f743729SConrad Meyer /**
850f743729SConrad Meyer  * Initialize the `COVER_best_t`.
860f743729SConrad Meyer  */
870f743729SConrad Meyer void COVER_best_init(COVER_best_t *best);
880f743729SConrad Meyer 
890f743729SConrad Meyer /**
900f743729SConrad Meyer  * Wait until liveJobs == 0.
910f743729SConrad Meyer  */
920f743729SConrad Meyer void COVER_best_wait(COVER_best_t *best);
930f743729SConrad Meyer 
940f743729SConrad Meyer /**
950f743729SConrad Meyer  * Call COVER_best_wait() and then destroy the COVER_best_t.
960f743729SConrad Meyer  */
970f743729SConrad Meyer void COVER_best_destroy(COVER_best_t *best);
980f743729SConrad Meyer 
990f743729SConrad Meyer /**
1000f743729SConrad Meyer  * Called when a thread is about to be launched.
1010f743729SConrad Meyer  * Increments liveJobs.
1020f743729SConrad Meyer  */
1030f743729SConrad Meyer void COVER_best_start(COVER_best_t *best);
1040f743729SConrad Meyer 
1050f743729SConrad Meyer /**
1060f743729SConrad Meyer  * Called when a thread finishes executing, both on error or success.
1070f743729SConrad Meyer  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
1080f743729SConrad Meyer  * If this dictionary is the best so far save it and its parameters.
1090f743729SConrad Meyer  */
1100f743729SConrad Meyer void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
1110f743729SConrad Meyer                        ZDICT_cover_params_t parameters, void *dict,
1120f743729SConrad Meyer                        size_t dictSize);
113