137f1f268SConrad Meyer /* 2*5ff13fbcSAllan Jude * Copyright (c) Facebook, Inc. 337f1f268SConrad Meyer * All rights reserved. 437f1f268SConrad Meyer * 537f1f268SConrad Meyer * This source code is licensed under both the BSD-style license (found in the 637f1f268SConrad Meyer * LICENSE file in the root directory of this source tree) and the GPLv2 (found 737f1f268SConrad Meyer * in the COPYING file in the root directory of this source tree). 837f1f268SConrad Meyer * You may select, at your option, one of the above-listed licenses. 937f1f268SConrad Meyer */ 1037f1f268SConrad Meyer 11*5ff13fbcSAllan Jude #ifndef ZDICT_STATIC_LINKING_ONLY 12*5ff13fbcSAllan Jude # define ZDICT_STATIC_LINKING_ONLY 13*5ff13fbcSAllan Jude #endif 14*5ff13fbcSAllan Jude 150f743729SConrad Meyer #include <stdio.h> /* fprintf */ 160f743729SConrad Meyer #include <stdlib.h> /* malloc, free, qsort */ 170f743729SConrad Meyer #include <string.h> /* memset */ 180f743729SConrad Meyer #include <time.h> /* clock */ 1937f1f268SConrad Meyer #include "../common/mem.h" /* read */ 2037f1f268SConrad Meyer #include "../common/pool.h" 2137f1f268SConrad Meyer #include "../common/threading.h" 2237f1f268SConrad Meyer #include "../common/zstd_internal.h" /* includes zstd.h */ 23*5ff13fbcSAllan Jude #include "../zdict.h" 240f743729SConrad Meyer 250f743729SConrad Meyer /** 260f743729SConrad Meyer * COVER_best_t is used for two purposes: 270f743729SConrad Meyer * 1. Synchronizing threads. 280f743729SConrad Meyer * 2. Saving the best parameters and dictionary. 290f743729SConrad Meyer * 300f743729SConrad Meyer * All of the methods except COVER_best_init() are thread safe if zstd is 310f743729SConrad Meyer * compiled with multithreaded support. 320f743729SConrad Meyer */ 330f743729SConrad Meyer typedef struct COVER_best_s { 340f743729SConrad Meyer ZSTD_pthread_mutex_t mutex; 350f743729SConrad Meyer ZSTD_pthread_cond_t cond; 360f743729SConrad Meyer size_t liveJobs; 370f743729SConrad Meyer void *dict; 380f743729SConrad Meyer size_t dictSize; 390f743729SConrad Meyer ZDICT_cover_params_t parameters; 400f743729SConrad Meyer size_t compressedSize; 410f743729SConrad Meyer } COVER_best_t; 420f743729SConrad Meyer 430f743729SConrad Meyer /** 440f743729SConrad Meyer * A segment is a range in the source as well as the score of the segment. 450f743729SConrad Meyer */ 460f743729SConrad Meyer typedef struct { 470f743729SConrad Meyer U32 begin; 480f743729SConrad Meyer U32 end; 490f743729SConrad Meyer U32 score; 500f743729SConrad Meyer } COVER_segment_t; 510f743729SConrad Meyer 520f743729SConrad Meyer /** 532b9c00cbSConrad Meyer *Number of epochs and size of each epoch. 542b9c00cbSConrad Meyer */ 552b9c00cbSConrad Meyer typedef struct { 562b9c00cbSConrad Meyer U32 num; 572b9c00cbSConrad Meyer U32 size; 582b9c00cbSConrad Meyer } COVER_epoch_info_t; 592b9c00cbSConrad Meyer 602b9c00cbSConrad Meyer /** 614d3f1eafSConrad Meyer * Struct used for the dictionary selection function. 624d3f1eafSConrad Meyer */ 634d3f1eafSConrad Meyer typedef struct COVER_dictSelection { 644d3f1eafSConrad Meyer BYTE* dictContent; 654d3f1eafSConrad Meyer size_t dictSize; 664d3f1eafSConrad Meyer size_t totalCompressedSize; 674d3f1eafSConrad Meyer } COVER_dictSelection_t; 684d3f1eafSConrad Meyer 694d3f1eafSConrad Meyer /** 702b9c00cbSConrad Meyer * Computes the number of epochs and the size of each epoch. 712b9c00cbSConrad Meyer * We will make sure that each epoch gets at least 10 * k bytes. 722b9c00cbSConrad Meyer * 732b9c00cbSConrad Meyer * The COVER algorithms divide the data up into epochs of equal size and 742b9c00cbSConrad Meyer * select one segment from each epoch. 752b9c00cbSConrad Meyer * 762b9c00cbSConrad Meyer * @param maxDictSize The maximum allowed dictionary size. 772b9c00cbSConrad Meyer * @param nbDmers The number of dmers we are training on. 782b9c00cbSConrad Meyer * @param k The parameter k (segment size). 792b9c00cbSConrad Meyer * @param passes The target number of passes over the dmer corpus. 802b9c00cbSConrad Meyer * More passes means a better dictionary. 812b9c00cbSConrad Meyer */ 822b9c00cbSConrad Meyer COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 832b9c00cbSConrad Meyer U32 k, U32 passes); 842b9c00cbSConrad Meyer 852b9c00cbSConrad Meyer /** 862b9c00cbSConrad Meyer * Warns the user when their corpus is too small. 872b9c00cbSConrad Meyer */ 882b9c00cbSConrad Meyer void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 892b9c00cbSConrad Meyer 902b9c00cbSConrad Meyer /** 910f743729SConrad Meyer * Checks total compressed size of a dictionary 920f743729SConrad Meyer */ 930f743729SConrad Meyer size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 940f743729SConrad Meyer const size_t *samplesSizes, const BYTE *samples, 950f743729SConrad Meyer size_t *offsets, 960f743729SConrad Meyer size_t nbTrainSamples, size_t nbSamples, 970f743729SConrad Meyer BYTE *const dict, size_t dictBufferCapacity); 980f743729SConrad Meyer 990f743729SConrad Meyer /** 1000f743729SConrad Meyer * Returns the sum of the sample sizes. 1010f743729SConrad Meyer */ 1020f743729SConrad Meyer size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 1030f743729SConrad Meyer 1040f743729SConrad Meyer /** 1050f743729SConrad Meyer * Initialize the `COVER_best_t`. 1060f743729SConrad Meyer */ 1070f743729SConrad Meyer void COVER_best_init(COVER_best_t *best); 1080f743729SConrad Meyer 1090f743729SConrad Meyer /** 1100f743729SConrad Meyer * Wait until liveJobs == 0. 1110f743729SConrad Meyer */ 1120f743729SConrad Meyer void COVER_best_wait(COVER_best_t *best); 1130f743729SConrad Meyer 1140f743729SConrad Meyer /** 1150f743729SConrad Meyer * Call COVER_best_wait() and then destroy the COVER_best_t. 1160f743729SConrad Meyer */ 1170f743729SConrad Meyer void COVER_best_destroy(COVER_best_t *best); 1180f743729SConrad Meyer 1190f743729SConrad Meyer /** 1200f743729SConrad Meyer * Called when a thread is about to be launched. 1210f743729SConrad Meyer * Increments liveJobs. 1220f743729SConrad Meyer */ 1230f743729SConrad Meyer void COVER_best_start(COVER_best_t *best); 1240f743729SConrad Meyer 1250f743729SConrad Meyer /** 1260f743729SConrad Meyer * Called when a thread finishes executing, both on error or success. 1270f743729SConrad Meyer * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 1280f743729SConrad Meyer * If this dictionary is the best so far save it and its parameters. 1290f743729SConrad Meyer */ 1304d3f1eafSConrad Meyer void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, 1314d3f1eafSConrad Meyer COVER_dictSelection_t selection); 1324d3f1eafSConrad Meyer /** 1334d3f1eafSConrad Meyer * Error function for COVER_selectDict function. Checks if the return 1344d3f1eafSConrad Meyer * value is an error. 1354d3f1eafSConrad Meyer */ 1364d3f1eafSConrad Meyer unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); 1374d3f1eafSConrad Meyer 1384d3f1eafSConrad Meyer /** 1394d3f1eafSConrad Meyer * Error function for COVER_selectDict function. Returns a struct where 1404d3f1eafSConrad Meyer * return.totalCompressedSize is a ZSTD error. 1414d3f1eafSConrad Meyer */ 1424d3f1eafSConrad Meyer COVER_dictSelection_t COVER_dictSelectionError(size_t error); 1434d3f1eafSConrad Meyer 1444d3f1eafSConrad Meyer /** 1454d3f1eafSConrad Meyer * Always call after selectDict is called to free up used memory from 1464d3f1eafSConrad Meyer * newly created dictionary. 1474d3f1eafSConrad Meyer */ 1484d3f1eafSConrad Meyer void COVER_dictSelectionFree(COVER_dictSelection_t selection); 1494d3f1eafSConrad Meyer 1504d3f1eafSConrad Meyer /** 1514d3f1eafSConrad Meyer * Called to finalize the dictionary and select one based on whether or not 1524d3f1eafSConrad Meyer * the shrink-dict flag was enabled. If enabled the dictionary used is the 1534d3f1eafSConrad Meyer * smallest dictionary within a specified regression of the compressed size 1544d3f1eafSConrad Meyer * from the largest dictionary. 1554d3f1eafSConrad Meyer */ 156f7cd7fe5SConrad Meyer COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, 1574d3f1eafSConrad Meyer size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, 1584d3f1eafSConrad Meyer size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); 159