xref: /freebsd/sys/contrib/zstd/lib/dictBuilder/cover.h (revision 5ff13fbc199bdf5f0572845351c68ee5ca828e71)
137f1f268SConrad Meyer /*
2*5ff13fbcSAllan Jude  * Copyright (c) Facebook, Inc.
337f1f268SConrad Meyer  * All rights reserved.
437f1f268SConrad Meyer  *
537f1f268SConrad Meyer  * This source code is licensed under both the BSD-style license (found in the
637f1f268SConrad Meyer  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
737f1f268SConrad Meyer  * in the COPYING file in the root directory of this source tree).
837f1f268SConrad Meyer  * You may select, at your option, one of the above-listed licenses.
937f1f268SConrad Meyer  */
1037f1f268SConrad Meyer 
11*5ff13fbcSAllan Jude #ifndef ZDICT_STATIC_LINKING_ONLY
12*5ff13fbcSAllan Jude #  define ZDICT_STATIC_LINKING_ONLY
13*5ff13fbcSAllan Jude #endif
14*5ff13fbcSAllan Jude 
150f743729SConrad Meyer #include <stdio.h>  /* fprintf */
160f743729SConrad Meyer #include <stdlib.h> /* malloc, free, qsort */
170f743729SConrad Meyer #include <string.h> /* memset */
180f743729SConrad Meyer #include <time.h>   /* clock */
1937f1f268SConrad Meyer #include "../common/mem.h" /* read */
2037f1f268SConrad Meyer #include "../common/pool.h"
2137f1f268SConrad Meyer #include "../common/threading.h"
2237f1f268SConrad Meyer #include "../common/zstd_internal.h" /* includes zstd.h */
23*5ff13fbcSAllan Jude #include "../zdict.h"
240f743729SConrad Meyer 
250f743729SConrad Meyer /**
260f743729SConrad Meyer  * COVER_best_t is used for two purposes:
270f743729SConrad Meyer  * 1. Synchronizing threads.
280f743729SConrad Meyer  * 2. Saving the best parameters and dictionary.
290f743729SConrad Meyer  *
300f743729SConrad Meyer  * All of the methods except COVER_best_init() are thread safe if zstd is
310f743729SConrad Meyer  * compiled with multithreaded support.
320f743729SConrad Meyer  */
330f743729SConrad Meyer typedef struct COVER_best_s {
340f743729SConrad Meyer   ZSTD_pthread_mutex_t mutex;
350f743729SConrad Meyer   ZSTD_pthread_cond_t cond;
360f743729SConrad Meyer   size_t liveJobs;
370f743729SConrad Meyer   void *dict;
380f743729SConrad Meyer   size_t dictSize;
390f743729SConrad Meyer   ZDICT_cover_params_t parameters;
400f743729SConrad Meyer   size_t compressedSize;
410f743729SConrad Meyer } COVER_best_t;
420f743729SConrad Meyer 
430f743729SConrad Meyer /**
440f743729SConrad Meyer  * A segment is a range in the source as well as the score of the segment.
450f743729SConrad Meyer  */
460f743729SConrad Meyer typedef struct {
470f743729SConrad Meyer   U32 begin;
480f743729SConrad Meyer   U32 end;
490f743729SConrad Meyer   U32 score;
500f743729SConrad Meyer } COVER_segment_t;
510f743729SConrad Meyer 
520f743729SConrad Meyer /**
532b9c00cbSConrad Meyer  *Number of epochs and size of each epoch.
542b9c00cbSConrad Meyer  */
552b9c00cbSConrad Meyer typedef struct {
562b9c00cbSConrad Meyer   U32 num;
572b9c00cbSConrad Meyer   U32 size;
582b9c00cbSConrad Meyer } COVER_epoch_info_t;
592b9c00cbSConrad Meyer 
602b9c00cbSConrad Meyer /**
614d3f1eafSConrad Meyer  * Struct used for the dictionary selection function.
624d3f1eafSConrad Meyer  */
634d3f1eafSConrad Meyer typedef struct COVER_dictSelection {
644d3f1eafSConrad Meyer   BYTE* dictContent;
654d3f1eafSConrad Meyer   size_t dictSize;
664d3f1eafSConrad Meyer   size_t totalCompressedSize;
674d3f1eafSConrad Meyer } COVER_dictSelection_t;
684d3f1eafSConrad Meyer 
694d3f1eafSConrad Meyer /**
702b9c00cbSConrad Meyer  * Computes the number of epochs and the size of each epoch.
712b9c00cbSConrad Meyer  * We will make sure that each epoch gets at least 10 * k bytes.
722b9c00cbSConrad Meyer  *
732b9c00cbSConrad Meyer  * The COVER algorithms divide the data up into epochs of equal size and
742b9c00cbSConrad Meyer  * select one segment from each epoch.
752b9c00cbSConrad Meyer  *
762b9c00cbSConrad Meyer  * @param maxDictSize The maximum allowed dictionary size.
772b9c00cbSConrad Meyer  * @param nbDmers     The number of dmers we are training on.
782b9c00cbSConrad Meyer  * @param k           The parameter k (segment size).
792b9c00cbSConrad Meyer  * @param passes      The target number of passes over the dmer corpus.
802b9c00cbSConrad Meyer  *                    More passes means a better dictionary.
812b9c00cbSConrad Meyer  */
822b9c00cbSConrad Meyer COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
832b9c00cbSConrad Meyer                                        U32 k, U32 passes);
842b9c00cbSConrad Meyer 
852b9c00cbSConrad Meyer /**
862b9c00cbSConrad Meyer  * Warns the user when their corpus is too small.
872b9c00cbSConrad Meyer  */
882b9c00cbSConrad Meyer void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
892b9c00cbSConrad Meyer 
902b9c00cbSConrad Meyer /**
910f743729SConrad Meyer  *  Checks total compressed size of a dictionary
920f743729SConrad Meyer  */
930f743729SConrad Meyer size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
940f743729SConrad Meyer                                       const size_t *samplesSizes, const BYTE *samples,
950f743729SConrad Meyer                                       size_t *offsets,
960f743729SConrad Meyer                                       size_t nbTrainSamples, size_t nbSamples,
970f743729SConrad Meyer                                       BYTE *const dict, size_t dictBufferCapacity);
980f743729SConrad Meyer 
990f743729SConrad Meyer /**
1000f743729SConrad Meyer  * Returns the sum of the sample sizes.
1010f743729SConrad Meyer  */
1020f743729SConrad Meyer size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
1030f743729SConrad Meyer 
1040f743729SConrad Meyer /**
1050f743729SConrad Meyer  * Initialize the `COVER_best_t`.
1060f743729SConrad Meyer  */
1070f743729SConrad Meyer void COVER_best_init(COVER_best_t *best);
1080f743729SConrad Meyer 
1090f743729SConrad Meyer /**
1100f743729SConrad Meyer  * Wait until liveJobs == 0.
1110f743729SConrad Meyer  */
1120f743729SConrad Meyer void COVER_best_wait(COVER_best_t *best);
1130f743729SConrad Meyer 
1140f743729SConrad Meyer /**
1150f743729SConrad Meyer  * Call COVER_best_wait() and then destroy the COVER_best_t.
1160f743729SConrad Meyer  */
1170f743729SConrad Meyer void COVER_best_destroy(COVER_best_t *best);
1180f743729SConrad Meyer 
1190f743729SConrad Meyer /**
1200f743729SConrad Meyer  * Called when a thread is about to be launched.
1210f743729SConrad Meyer  * Increments liveJobs.
1220f743729SConrad Meyer  */
1230f743729SConrad Meyer void COVER_best_start(COVER_best_t *best);
1240f743729SConrad Meyer 
1250f743729SConrad Meyer /**
1260f743729SConrad Meyer  * Called when a thread finishes executing, both on error or success.
1270f743729SConrad Meyer  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
1280f743729SConrad Meyer  * If this dictionary is the best so far save it and its parameters.
1290f743729SConrad Meyer  */
1304d3f1eafSConrad Meyer void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
1314d3f1eafSConrad Meyer                        COVER_dictSelection_t selection);
1324d3f1eafSConrad Meyer /**
1334d3f1eafSConrad Meyer  * Error function for COVER_selectDict function. Checks if the return
1344d3f1eafSConrad Meyer  * value is an error.
1354d3f1eafSConrad Meyer  */
1364d3f1eafSConrad Meyer unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
1374d3f1eafSConrad Meyer 
1384d3f1eafSConrad Meyer  /**
1394d3f1eafSConrad Meyer   * Error function for COVER_selectDict function. Returns a struct where
1404d3f1eafSConrad Meyer   * return.totalCompressedSize is a ZSTD error.
1414d3f1eafSConrad Meyer   */
1424d3f1eafSConrad Meyer COVER_dictSelection_t COVER_dictSelectionError(size_t error);
1434d3f1eafSConrad Meyer 
1444d3f1eafSConrad Meyer /**
1454d3f1eafSConrad Meyer  * Always call after selectDict is called to free up used memory from
1464d3f1eafSConrad Meyer  * newly created dictionary.
1474d3f1eafSConrad Meyer  */
1484d3f1eafSConrad Meyer void COVER_dictSelectionFree(COVER_dictSelection_t selection);
1494d3f1eafSConrad Meyer 
1504d3f1eafSConrad Meyer /**
1514d3f1eafSConrad Meyer  * Called to finalize the dictionary and select one based on whether or not
1524d3f1eafSConrad Meyer  * the shrink-dict flag was enabled. If enabled the dictionary used is the
1534d3f1eafSConrad Meyer  * smallest dictionary within a specified regression of the compressed size
1544d3f1eafSConrad Meyer  * from the largest dictionary.
1554d3f1eafSConrad Meyer  */
156f7cd7fe5SConrad Meyer  COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
1574d3f1eafSConrad Meyer                        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
1584d3f1eafSConrad Meyer                        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
159