10f743729SConrad Meyer #include <stdio.h> /* fprintf */ 20f743729SConrad Meyer #include <stdlib.h> /* malloc, free, qsort */ 30f743729SConrad Meyer #include <string.h> /* memset */ 40f743729SConrad Meyer #include <time.h> /* clock */ 50f743729SConrad Meyer #include "mem.h" /* read */ 60f743729SConrad Meyer #include "pool.h" 70f743729SConrad Meyer #include "threading.h" 80f743729SConrad Meyer #include "zstd_internal.h" /* includes zstd.h */ 90f743729SConrad Meyer #ifndef ZDICT_STATIC_LINKING_ONLY 100f743729SConrad Meyer #define ZDICT_STATIC_LINKING_ONLY 110f743729SConrad Meyer #endif 120f743729SConrad Meyer #include "zdict.h" 130f743729SConrad Meyer 140f743729SConrad Meyer /** 150f743729SConrad Meyer * COVER_best_t is used for two purposes: 160f743729SConrad Meyer * 1. Synchronizing threads. 170f743729SConrad Meyer * 2. Saving the best parameters and dictionary. 180f743729SConrad Meyer * 190f743729SConrad Meyer * All of the methods except COVER_best_init() are thread safe if zstd is 200f743729SConrad Meyer * compiled with multithreaded support. 210f743729SConrad Meyer */ 220f743729SConrad Meyer typedef struct COVER_best_s { 230f743729SConrad Meyer ZSTD_pthread_mutex_t mutex; 240f743729SConrad Meyer ZSTD_pthread_cond_t cond; 250f743729SConrad Meyer size_t liveJobs; 260f743729SConrad Meyer void *dict; 270f743729SConrad Meyer size_t dictSize; 280f743729SConrad Meyer ZDICT_cover_params_t parameters; 290f743729SConrad Meyer size_t compressedSize; 300f743729SConrad Meyer } COVER_best_t; 310f743729SConrad Meyer 320f743729SConrad Meyer /** 330f743729SConrad Meyer * A segment is a range in the source as well as the score of the segment. 340f743729SConrad Meyer */ 350f743729SConrad Meyer typedef struct { 360f743729SConrad Meyer U32 begin; 370f743729SConrad Meyer U32 end; 380f743729SConrad Meyer U32 score; 390f743729SConrad Meyer } COVER_segment_t; 400f743729SConrad Meyer 410f743729SConrad Meyer /** 42*2b9c00cbSConrad Meyer *Number of epochs and size of each epoch. 43*2b9c00cbSConrad Meyer */ 44*2b9c00cbSConrad Meyer typedef struct { 45*2b9c00cbSConrad Meyer U32 num; 46*2b9c00cbSConrad Meyer U32 size; 47*2b9c00cbSConrad Meyer } COVER_epoch_info_t; 48*2b9c00cbSConrad Meyer 49*2b9c00cbSConrad Meyer /** 50*2b9c00cbSConrad Meyer * Computes the number of epochs and the size of each epoch. 51*2b9c00cbSConrad Meyer * We will make sure that each epoch gets at least 10 * k bytes. 52*2b9c00cbSConrad Meyer * 53*2b9c00cbSConrad Meyer * The COVER algorithms divide the data up into epochs of equal size and 54*2b9c00cbSConrad Meyer * select one segment from each epoch. 55*2b9c00cbSConrad Meyer * 56*2b9c00cbSConrad Meyer * @param maxDictSize The maximum allowed dictionary size. 57*2b9c00cbSConrad Meyer * @param nbDmers The number of dmers we are training on. 58*2b9c00cbSConrad Meyer * @param k The parameter k (segment size). 59*2b9c00cbSConrad Meyer * @param passes The target number of passes over the dmer corpus. 60*2b9c00cbSConrad Meyer * More passes means a better dictionary. 61*2b9c00cbSConrad Meyer */ 62*2b9c00cbSConrad Meyer COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 63*2b9c00cbSConrad Meyer U32 k, U32 passes); 64*2b9c00cbSConrad Meyer 65*2b9c00cbSConrad Meyer /** 66*2b9c00cbSConrad Meyer * Warns the user when their corpus is too small. 67*2b9c00cbSConrad Meyer */ 68*2b9c00cbSConrad Meyer void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 69*2b9c00cbSConrad Meyer 70*2b9c00cbSConrad Meyer /** 710f743729SConrad Meyer * Checks total compressed size of a dictionary 720f743729SConrad Meyer */ 730f743729SConrad Meyer size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 740f743729SConrad Meyer const size_t *samplesSizes, const BYTE *samples, 750f743729SConrad Meyer size_t *offsets, 760f743729SConrad Meyer size_t nbTrainSamples, size_t nbSamples, 770f743729SConrad Meyer BYTE *const dict, size_t dictBufferCapacity); 780f743729SConrad Meyer 790f743729SConrad Meyer /** 800f743729SConrad Meyer * Returns the sum of the sample sizes. 810f743729SConrad Meyer */ 820f743729SConrad Meyer size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 830f743729SConrad Meyer 840f743729SConrad Meyer /** 850f743729SConrad Meyer * Initialize the `COVER_best_t`. 860f743729SConrad Meyer */ 870f743729SConrad Meyer void COVER_best_init(COVER_best_t *best); 880f743729SConrad Meyer 890f743729SConrad Meyer /** 900f743729SConrad Meyer * Wait until liveJobs == 0. 910f743729SConrad Meyer */ 920f743729SConrad Meyer void COVER_best_wait(COVER_best_t *best); 930f743729SConrad Meyer 940f743729SConrad Meyer /** 950f743729SConrad Meyer * Call COVER_best_wait() and then destroy the COVER_best_t. 960f743729SConrad Meyer */ 970f743729SConrad Meyer void COVER_best_destroy(COVER_best_t *best); 980f743729SConrad Meyer 990f743729SConrad Meyer /** 1000f743729SConrad Meyer * Called when a thread is about to be launched. 1010f743729SConrad Meyer * Increments liveJobs. 1020f743729SConrad Meyer */ 1030f743729SConrad Meyer void COVER_best_start(COVER_best_t *best); 1040f743729SConrad Meyer 1050f743729SConrad Meyer /** 1060f743729SConrad Meyer * Called when a thread finishes executing, both on error or success. 1070f743729SConrad Meyer * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 1080f743729SConrad Meyer * If this dictionary is the best so far save it and its parameters. 1090f743729SConrad Meyer */ 1100f743729SConrad Meyer void COVER_best_finish(COVER_best_t *best, size_t compressedSize, 1110f743729SConrad Meyer ZDICT_cover_params_t parameters, void *dict, 1120f743729SConrad Meyer size_t dictSize); 113