137f1f268SConrad Meyer /* 237f1f268SConrad Meyer * Copyright (c) 2017-2020, Facebook, Inc. 337f1f268SConrad Meyer * All rights reserved. 437f1f268SConrad Meyer * 537f1f268SConrad Meyer * This source code is licensed under both the BSD-style license (found in the 637f1f268SConrad Meyer * LICENSE file in the root directory of this source tree) and the GPLv2 (found 737f1f268SConrad Meyer * in the COPYING file in the root directory of this source tree). 837f1f268SConrad Meyer * You may select, at your option, one of the above-listed licenses. 937f1f268SConrad Meyer */ 1037f1f268SConrad Meyer 110f743729SConrad Meyer #include <stdio.h> /* fprintf */ 120f743729SConrad Meyer #include <stdlib.h> /* malloc, free, qsort */ 130f743729SConrad Meyer #include <string.h> /* memset */ 140f743729SConrad Meyer #include <time.h> /* clock */ 1537f1f268SConrad Meyer #include "../common/mem.h" /* read */ 1637f1f268SConrad Meyer #include "../common/pool.h" 1737f1f268SConrad Meyer #include "../common/threading.h" 1837f1f268SConrad Meyer #include "../common/zstd_internal.h" /* includes zstd.h */ 190f743729SConrad Meyer #ifndef ZDICT_STATIC_LINKING_ONLY 200f743729SConrad Meyer #define ZDICT_STATIC_LINKING_ONLY 210f743729SConrad Meyer #endif 220f743729SConrad Meyer #include "zdict.h" 230f743729SConrad Meyer 240f743729SConrad Meyer /** 250f743729SConrad Meyer * COVER_best_t is used for two purposes: 260f743729SConrad Meyer * 1. Synchronizing threads. 270f743729SConrad Meyer * 2. Saving the best parameters and dictionary. 280f743729SConrad Meyer * 290f743729SConrad Meyer * All of the methods except COVER_best_init() are thread safe if zstd is 300f743729SConrad Meyer * compiled with multithreaded support. 310f743729SConrad Meyer */ 320f743729SConrad Meyer typedef struct COVER_best_s { 330f743729SConrad Meyer ZSTD_pthread_mutex_t mutex; 340f743729SConrad Meyer ZSTD_pthread_cond_t cond; 350f743729SConrad Meyer size_t liveJobs; 360f743729SConrad Meyer void *dict; 370f743729SConrad Meyer size_t dictSize; 380f743729SConrad Meyer ZDICT_cover_params_t parameters; 390f743729SConrad Meyer size_t compressedSize; 400f743729SConrad Meyer } COVER_best_t; 410f743729SConrad Meyer 420f743729SConrad Meyer /** 430f743729SConrad Meyer * A segment is a range in the source as well as the score of the segment. 440f743729SConrad Meyer */ 450f743729SConrad Meyer typedef struct { 460f743729SConrad Meyer U32 begin; 470f743729SConrad Meyer U32 end; 480f743729SConrad Meyer U32 score; 490f743729SConrad Meyer } COVER_segment_t; 500f743729SConrad Meyer 510f743729SConrad Meyer /** 522b9c00cbSConrad Meyer *Number of epochs and size of each epoch. 532b9c00cbSConrad Meyer */ 542b9c00cbSConrad Meyer typedef struct { 552b9c00cbSConrad Meyer U32 num; 562b9c00cbSConrad Meyer U32 size; 572b9c00cbSConrad Meyer } COVER_epoch_info_t; 582b9c00cbSConrad Meyer 592b9c00cbSConrad Meyer /** 604d3f1eafSConrad Meyer * Struct used for the dictionary selection function. 614d3f1eafSConrad Meyer */ 624d3f1eafSConrad Meyer typedef struct COVER_dictSelection { 634d3f1eafSConrad Meyer BYTE* dictContent; 644d3f1eafSConrad Meyer size_t dictSize; 654d3f1eafSConrad Meyer size_t totalCompressedSize; 664d3f1eafSConrad Meyer } COVER_dictSelection_t; 674d3f1eafSConrad Meyer 684d3f1eafSConrad Meyer /** 692b9c00cbSConrad Meyer * Computes the number of epochs and the size of each epoch. 702b9c00cbSConrad Meyer * We will make sure that each epoch gets at least 10 * k bytes. 712b9c00cbSConrad Meyer * 722b9c00cbSConrad Meyer * The COVER algorithms divide the data up into epochs of equal size and 732b9c00cbSConrad Meyer * select one segment from each epoch. 742b9c00cbSConrad Meyer * 752b9c00cbSConrad Meyer * @param maxDictSize The maximum allowed dictionary size. 762b9c00cbSConrad Meyer * @param nbDmers The number of dmers we are training on. 772b9c00cbSConrad Meyer * @param k The parameter k (segment size). 782b9c00cbSConrad Meyer * @param passes The target number of passes over the dmer corpus. 792b9c00cbSConrad Meyer * More passes means a better dictionary. 802b9c00cbSConrad Meyer */ 812b9c00cbSConrad Meyer COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 822b9c00cbSConrad Meyer U32 k, U32 passes); 832b9c00cbSConrad Meyer 842b9c00cbSConrad Meyer /** 852b9c00cbSConrad Meyer * Warns the user when their corpus is too small. 862b9c00cbSConrad Meyer */ 872b9c00cbSConrad Meyer void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 882b9c00cbSConrad Meyer 892b9c00cbSConrad Meyer /** 900f743729SConrad Meyer * Checks total compressed size of a dictionary 910f743729SConrad Meyer */ 920f743729SConrad Meyer size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 930f743729SConrad Meyer const size_t *samplesSizes, const BYTE *samples, 940f743729SConrad Meyer size_t *offsets, 950f743729SConrad Meyer size_t nbTrainSamples, size_t nbSamples, 960f743729SConrad Meyer BYTE *const dict, size_t dictBufferCapacity); 970f743729SConrad Meyer 980f743729SConrad Meyer /** 990f743729SConrad Meyer * Returns the sum of the sample sizes. 1000f743729SConrad Meyer */ 1010f743729SConrad Meyer size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 1020f743729SConrad Meyer 1030f743729SConrad Meyer /** 1040f743729SConrad Meyer * Initialize the `COVER_best_t`. 1050f743729SConrad Meyer */ 1060f743729SConrad Meyer void COVER_best_init(COVER_best_t *best); 1070f743729SConrad Meyer 1080f743729SConrad Meyer /** 1090f743729SConrad Meyer * Wait until liveJobs == 0. 1100f743729SConrad Meyer */ 1110f743729SConrad Meyer void COVER_best_wait(COVER_best_t *best); 1120f743729SConrad Meyer 1130f743729SConrad Meyer /** 1140f743729SConrad Meyer * Call COVER_best_wait() and then destroy the COVER_best_t. 1150f743729SConrad Meyer */ 1160f743729SConrad Meyer void COVER_best_destroy(COVER_best_t *best); 1170f743729SConrad Meyer 1180f743729SConrad Meyer /** 1190f743729SConrad Meyer * Called when a thread is about to be launched. 1200f743729SConrad Meyer * Increments liveJobs. 1210f743729SConrad Meyer */ 1220f743729SConrad Meyer void COVER_best_start(COVER_best_t *best); 1230f743729SConrad Meyer 1240f743729SConrad Meyer /** 1250f743729SConrad Meyer * Called when a thread finishes executing, both on error or success. 1260f743729SConrad Meyer * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 1270f743729SConrad Meyer * If this dictionary is the best so far save it and its parameters. 1280f743729SConrad Meyer */ 1294d3f1eafSConrad Meyer void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, 1304d3f1eafSConrad Meyer COVER_dictSelection_t selection); 1314d3f1eafSConrad Meyer /** 1324d3f1eafSConrad Meyer * Error function for COVER_selectDict function. Checks if the return 1334d3f1eafSConrad Meyer * value is an error. 1344d3f1eafSConrad Meyer */ 1354d3f1eafSConrad Meyer unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); 1364d3f1eafSConrad Meyer 1374d3f1eafSConrad Meyer /** 1384d3f1eafSConrad Meyer * Error function for COVER_selectDict function. Returns a struct where 1394d3f1eafSConrad Meyer * return.totalCompressedSize is a ZSTD error. 1404d3f1eafSConrad Meyer */ 1414d3f1eafSConrad Meyer COVER_dictSelection_t COVER_dictSelectionError(size_t error); 1424d3f1eafSConrad Meyer 1434d3f1eafSConrad Meyer /** 1444d3f1eafSConrad Meyer * Always call after selectDict is called to free up used memory from 1454d3f1eafSConrad Meyer * newly created dictionary. 1464d3f1eafSConrad Meyer */ 1474d3f1eafSConrad Meyer void COVER_dictSelectionFree(COVER_dictSelection_t selection); 1484d3f1eafSConrad Meyer 1494d3f1eafSConrad Meyer /** 1504d3f1eafSConrad Meyer * Called to finalize the dictionary and select one based on whether or not 1514d3f1eafSConrad Meyer * the shrink-dict flag was enabled. If enabled the dictionary used is the 1524d3f1eafSConrad Meyer * smallest dictionary within a specified regression of the compressed size 1534d3f1eafSConrad Meyer * from the largest dictionary. 1544d3f1eafSConrad Meyer */ 155*f7cd7fe5SConrad Meyer COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, 1564d3f1eafSConrad Meyer size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, 1574d3f1eafSConrad Meyer size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); 158