xref: /freebsd/sys/contrib/zstd/lib/dictBuilder/cover.c (revision 4d3f1eafc9372600fc1e5472187846d07fe96c54)
10c16b537SWarner Losh /*
20c16b537SWarner Losh  * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
30c16b537SWarner Losh  * All rights reserved.
40c16b537SWarner Losh  *
50c16b537SWarner Losh  * This source code is licensed under both the BSD-style license (found in the
60c16b537SWarner Losh  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
70c16b537SWarner Losh  * in the COPYING file in the root directory of this source tree).
80c16b537SWarner Losh  * You may select, at your option, one of the above-listed licenses.
90c16b537SWarner Losh  */
100c16b537SWarner Losh 
110c16b537SWarner Losh /* *****************************************************************************
120c16b537SWarner Losh  * Constructs a dictionary using a heuristic based on the following paper:
130c16b537SWarner Losh  *
140c16b537SWarner Losh  * Liao, Petri, Moffat, Wirth
150c16b537SWarner Losh  * Effective Construction of Relative Lempel-Ziv Dictionaries
160c16b537SWarner Losh  * Published in WWW 2016.
170c16b537SWarner Losh  *
180c16b537SWarner Losh  * Adapted from code originally written by @ot (Giuseppe Ottaviano).
190c16b537SWarner Losh  ******************************************************************************/
200c16b537SWarner Losh 
210c16b537SWarner Losh /*-*************************************
220c16b537SWarner Losh *  Dependencies
230c16b537SWarner Losh ***************************************/
240c16b537SWarner Losh #include <stdio.h>  /* fprintf */
250c16b537SWarner Losh #include <stdlib.h> /* malloc, free, qsort */
260c16b537SWarner Losh #include <string.h> /* memset */
270c16b537SWarner Losh #include <time.h>   /* clock */
280c16b537SWarner Losh 
290c16b537SWarner Losh #include "mem.h" /* read */
300c16b537SWarner Losh #include "pool.h"
310c16b537SWarner Losh #include "threading.h"
320f743729SConrad Meyer #include "cover.h"
330c16b537SWarner Losh #include "zstd_internal.h" /* includes zstd.h */
340c16b537SWarner Losh #ifndef ZDICT_STATIC_LINKING_ONLY
350c16b537SWarner Losh #define ZDICT_STATIC_LINKING_ONLY
360c16b537SWarner Losh #endif
370c16b537SWarner Losh #include "zdict.h"
380c16b537SWarner Losh 
390c16b537SWarner Losh /*-*************************************
400c16b537SWarner Losh *  Constants
410c16b537SWarner Losh ***************************************/
42a0483764SConrad Meyer #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
430f743729SConrad Meyer #define DEFAULT_SPLITPOINT 1.0
440c16b537SWarner Losh 
450c16b537SWarner Losh /*-*************************************
460c16b537SWarner Losh *  Console display
470c16b537SWarner Losh ***************************************/
480c16b537SWarner Losh static int g_displayLevel = 2;
490c16b537SWarner Losh #define DISPLAY(...)                                                           \
500c16b537SWarner Losh   {                                                                            \
510c16b537SWarner Losh     fprintf(stderr, __VA_ARGS__);                                              \
520c16b537SWarner Losh     fflush(stderr);                                                            \
530c16b537SWarner Losh   }
540c16b537SWarner Losh #define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
550c16b537SWarner Losh   if (displayLevel >= l) {                                                     \
560c16b537SWarner Losh     DISPLAY(__VA_ARGS__);                                                      \
570c16b537SWarner Losh   } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
580c16b537SWarner Losh #define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
590c16b537SWarner Losh 
600c16b537SWarner Losh #define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
610c16b537SWarner Losh   if (displayLevel >= l) {                                                     \
620c16b537SWarner Losh     if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
630c16b537SWarner Losh       g_time = clock();                                                        \
640c16b537SWarner Losh       DISPLAY(__VA_ARGS__);                                                    \
650c16b537SWarner Losh     }                                                                          \
660c16b537SWarner Losh   }
670c16b537SWarner Losh #define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
680c16b537SWarner Losh static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
690c16b537SWarner Losh static clock_t g_time = 0;
700c16b537SWarner Losh 
710c16b537SWarner Losh /*-*************************************
720c16b537SWarner Losh * Hash table
730c16b537SWarner Losh ***************************************
740c16b537SWarner Losh * A small specialized hash map for storing activeDmers.
750c16b537SWarner Losh * The map does not resize, so if it becomes full it will loop forever.
760c16b537SWarner Losh * Thus, the map must be large enough to store every value.
770c16b537SWarner Losh * The map implements linear probing and keeps its load less than 0.5.
780c16b537SWarner Losh */
790c16b537SWarner Losh 
800c16b537SWarner Losh #define MAP_EMPTY_VALUE ((U32)-1)
810c16b537SWarner Losh typedef struct COVER_map_pair_t_s {
820c16b537SWarner Losh   U32 key;
830c16b537SWarner Losh   U32 value;
840c16b537SWarner Losh } COVER_map_pair_t;
850c16b537SWarner Losh 
860c16b537SWarner Losh typedef struct COVER_map_s {
870c16b537SWarner Losh   COVER_map_pair_t *data;
880c16b537SWarner Losh   U32 sizeLog;
890c16b537SWarner Losh   U32 size;
900c16b537SWarner Losh   U32 sizeMask;
910c16b537SWarner Losh } COVER_map_t;
920c16b537SWarner Losh 
930c16b537SWarner Losh /**
940c16b537SWarner Losh  * Clear the map.
950c16b537SWarner Losh  */
960c16b537SWarner Losh static void COVER_map_clear(COVER_map_t *map) {
970c16b537SWarner Losh   memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
980c16b537SWarner Losh }
990c16b537SWarner Losh 
1000c16b537SWarner Losh /**
1010c16b537SWarner Losh  * Initializes a map of the given size.
1020c16b537SWarner Losh  * Returns 1 on success and 0 on failure.
1030c16b537SWarner Losh  * The map must be destroyed with COVER_map_destroy().
1040c16b537SWarner Losh  * The map is only guaranteed to be large enough to hold size elements.
1050c16b537SWarner Losh  */
1060c16b537SWarner Losh static int COVER_map_init(COVER_map_t *map, U32 size) {
1070c16b537SWarner Losh   map->sizeLog = ZSTD_highbit32(size) + 2;
1080c16b537SWarner Losh   map->size = (U32)1 << map->sizeLog;
1090c16b537SWarner Losh   map->sizeMask = map->size - 1;
1100c16b537SWarner Losh   map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
1110c16b537SWarner Losh   if (!map->data) {
1120c16b537SWarner Losh     map->sizeLog = 0;
1130c16b537SWarner Losh     map->size = 0;
1140c16b537SWarner Losh     return 0;
1150c16b537SWarner Losh   }
1160c16b537SWarner Losh   COVER_map_clear(map);
1170c16b537SWarner Losh   return 1;
1180c16b537SWarner Losh }
1190c16b537SWarner Losh 
1200c16b537SWarner Losh /**
1210c16b537SWarner Losh  * Internal hash function
1220c16b537SWarner Losh  */
1230c16b537SWarner Losh static const U32 prime4bytes = 2654435761U;
1240c16b537SWarner Losh static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
1250c16b537SWarner Losh   return (key * prime4bytes) >> (32 - map->sizeLog);
1260c16b537SWarner Losh }
1270c16b537SWarner Losh 
1280c16b537SWarner Losh /**
1290c16b537SWarner Losh  * Helper function that returns the index that a key should be placed into.
1300c16b537SWarner Losh  */
1310c16b537SWarner Losh static U32 COVER_map_index(COVER_map_t *map, U32 key) {
1320c16b537SWarner Losh   const U32 hash = COVER_map_hash(map, key);
1330c16b537SWarner Losh   U32 i;
1340c16b537SWarner Losh   for (i = hash;; i = (i + 1) & map->sizeMask) {
1350c16b537SWarner Losh     COVER_map_pair_t *pos = &map->data[i];
1360c16b537SWarner Losh     if (pos->value == MAP_EMPTY_VALUE) {
1370c16b537SWarner Losh       return i;
1380c16b537SWarner Losh     }
1390c16b537SWarner Losh     if (pos->key == key) {
1400c16b537SWarner Losh       return i;
1410c16b537SWarner Losh     }
1420c16b537SWarner Losh   }
1430c16b537SWarner Losh }
1440c16b537SWarner Losh 
1450c16b537SWarner Losh /**
1460c16b537SWarner Losh  * Returns the pointer to the value for key.
1470c16b537SWarner Losh  * If key is not in the map, it is inserted and the value is set to 0.
1480c16b537SWarner Losh  * The map must not be full.
1490c16b537SWarner Losh  */
1500c16b537SWarner Losh static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
1510c16b537SWarner Losh   COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
1520c16b537SWarner Losh   if (pos->value == MAP_EMPTY_VALUE) {
1530c16b537SWarner Losh     pos->key = key;
1540c16b537SWarner Losh     pos->value = 0;
1550c16b537SWarner Losh   }
1560c16b537SWarner Losh   return &pos->value;
1570c16b537SWarner Losh }
1580c16b537SWarner Losh 
1590c16b537SWarner Losh /**
1600c16b537SWarner Losh  * Deletes key from the map if present.
1610c16b537SWarner Losh  */
1620c16b537SWarner Losh static void COVER_map_remove(COVER_map_t *map, U32 key) {
1630c16b537SWarner Losh   U32 i = COVER_map_index(map, key);
1640c16b537SWarner Losh   COVER_map_pair_t *del = &map->data[i];
1650c16b537SWarner Losh   U32 shift = 1;
1660c16b537SWarner Losh   if (del->value == MAP_EMPTY_VALUE) {
1670c16b537SWarner Losh     return;
1680c16b537SWarner Losh   }
1690c16b537SWarner Losh   for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
1700c16b537SWarner Losh     COVER_map_pair_t *const pos = &map->data[i];
1710c16b537SWarner Losh     /* If the position is empty we are done */
1720c16b537SWarner Losh     if (pos->value == MAP_EMPTY_VALUE) {
1730c16b537SWarner Losh       del->value = MAP_EMPTY_VALUE;
1740c16b537SWarner Losh       return;
1750c16b537SWarner Losh     }
1760c16b537SWarner Losh     /* If pos can be moved to del do so */
1770c16b537SWarner Losh     if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
1780c16b537SWarner Losh       del->key = pos->key;
1790c16b537SWarner Losh       del->value = pos->value;
1800c16b537SWarner Losh       del = pos;
1810c16b537SWarner Losh       shift = 1;
1820c16b537SWarner Losh     } else {
1830c16b537SWarner Losh       ++shift;
1840c16b537SWarner Losh     }
1850c16b537SWarner Losh   }
1860c16b537SWarner Losh }
1870c16b537SWarner Losh 
1880c16b537SWarner Losh /**
1890f743729SConrad Meyer  * Destroys a map that is inited with COVER_map_init().
1900c16b537SWarner Losh  */
1910c16b537SWarner Losh static void COVER_map_destroy(COVER_map_t *map) {
1920c16b537SWarner Losh   if (map->data) {
1930c16b537SWarner Losh     free(map->data);
1940c16b537SWarner Losh   }
1950c16b537SWarner Losh   map->data = NULL;
1960c16b537SWarner Losh   map->size = 0;
1970c16b537SWarner Losh }
1980c16b537SWarner Losh 
1990c16b537SWarner Losh /*-*************************************
2000c16b537SWarner Losh * Context
2010c16b537SWarner Losh ***************************************/
2020c16b537SWarner Losh 
2030c16b537SWarner Losh typedef struct {
2040c16b537SWarner Losh   const BYTE *samples;
2050c16b537SWarner Losh   size_t *offsets;
2060c16b537SWarner Losh   const size_t *samplesSizes;
2070c16b537SWarner Losh   size_t nbSamples;
2080f743729SConrad Meyer   size_t nbTrainSamples;
2090f743729SConrad Meyer   size_t nbTestSamples;
2100c16b537SWarner Losh   U32 *suffix;
2110c16b537SWarner Losh   size_t suffixSize;
2120c16b537SWarner Losh   U32 *freqs;
2130c16b537SWarner Losh   U32 *dmerAt;
2140c16b537SWarner Losh   unsigned d;
2150c16b537SWarner Losh } COVER_ctx_t;
2160c16b537SWarner Losh 
2170c16b537SWarner Losh /* We need a global context for qsort... */
2180c16b537SWarner Losh static COVER_ctx_t *g_ctx = NULL;
2190c16b537SWarner Losh 
2200c16b537SWarner Losh /*-*************************************
2210c16b537SWarner Losh *  Helper functions
2220c16b537SWarner Losh ***************************************/
2230c16b537SWarner Losh 
2240c16b537SWarner Losh /**
2250c16b537SWarner Losh  * Returns the sum of the sample sizes.
2260c16b537SWarner Losh  */
2270f743729SConrad Meyer size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
2280c16b537SWarner Losh   size_t sum = 0;
2290f743729SConrad Meyer   unsigned i;
2300c16b537SWarner Losh   for (i = 0; i < nbSamples; ++i) {
2310c16b537SWarner Losh     sum += samplesSizes[i];
2320c16b537SWarner Losh   }
2330c16b537SWarner Losh   return sum;
2340c16b537SWarner Losh }
2350c16b537SWarner Losh 
2360c16b537SWarner Losh /**
2370c16b537SWarner Losh  * Returns -1 if the dmer at lp is less than the dmer at rp.
2380c16b537SWarner Losh  * Return 0 if the dmers at lp and rp are equal.
2390c16b537SWarner Losh  * Returns 1 if the dmer at lp is greater than the dmer at rp.
2400c16b537SWarner Losh  */
2410c16b537SWarner Losh static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
2420c16b537SWarner Losh   U32 const lhs = *(U32 const *)lp;
2430c16b537SWarner Losh   U32 const rhs = *(U32 const *)rp;
2440c16b537SWarner Losh   return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
2450c16b537SWarner Losh }
2460c16b537SWarner Losh /**
2470c16b537SWarner Losh  * Faster version for d <= 8.
2480c16b537SWarner Losh  */
2490c16b537SWarner Losh static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
2500c16b537SWarner Losh   U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
2510c16b537SWarner Losh   U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
2520c16b537SWarner Losh   U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
2530c16b537SWarner Losh   if (lhs < rhs) {
2540c16b537SWarner Losh     return -1;
2550c16b537SWarner Losh   }
2560c16b537SWarner Losh   return (lhs > rhs);
2570c16b537SWarner Losh }
2580c16b537SWarner Losh 
2590c16b537SWarner Losh /**
2600c16b537SWarner Losh  * Same as COVER_cmp() except ties are broken by pointer value
2610c16b537SWarner Losh  * NOTE: g_ctx must be set to call this function.  A global is required because
2620c16b537SWarner Losh  * qsort doesn't take an opaque pointer.
2630c16b537SWarner Losh  */
2640c16b537SWarner Losh static int COVER_strict_cmp(const void *lp, const void *rp) {
2650c16b537SWarner Losh   int result = COVER_cmp(g_ctx, lp, rp);
2660c16b537SWarner Losh   if (result == 0) {
2670c16b537SWarner Losh     result = lp < rp ? -1 : 1;
2680c16b537SWarner Losh   }
2690c16b537SWarner Losh   return result;
2700c16b537SWarner Losh }
2710c16b537SWarner Losh /**
2720c16b537SWarner Losh  * Faster version for d <= 8.
2730c16b537SWarner Losh  */
2740c16b537SWarner Losh static int COVER_strict_cmp8(const void *lp, const void *rp) {
2750c16b537SWarner Losh   int result = COVER_cmp8(g_ctx, lp, rp);
2760c16b537SWarner Losh   if (result == 0) {
2770c16b537SWarner Losh     result = lp < rp ? -1 : 1;
2780c16b537SWarner Losh   }
2790c16b537SWarner Losh   return result;
2800c16b537SWarner Losh }
2810c16b537SWarner Losh 
2820c16b537SWarner Losh /**
2830c16b537SWarner Losh  * Returns the first pointer in [first, last) whose element does not compare
2840c16b537SWarner Losh  * less than value.  If no such element exists it returns last.
2850c16b537SWarner Losh  */
2860c16b537SWarner Losh static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
2870c16b537SWarner Losh                                        size_t value) {
2880c16b537SWarner Losh   size_t count = last - first;
2890c16b537SWarner Losh   while (count != 0) {
2900c16b537SWarner Losh     size_t step = count / 2;
2910c16b537SWarner Losh     const size_t *ptr = first;
2920c16b537SWarner Losh     ptr += step;
2930c16b537SWarner Losh     if (*ptr < value) {
2940c16b537SWarner Losh       first = ++ptr;
2950c16b537SWarner Losh       count -= step + 1;
2960c16b537SWarner Losh     } else {
2970c16b537SWarner Losh       count = step;
2980c16b537SWarner Losh     }
2990c16b537SWarner Losh   }
3000c16b537SWarner Losh   return first;
3010c16b537SWarner Losh }
3020c16b537SWarner Losh 
3030c16b537SWarner Losh /**
3040c16b537SWarner Losh  * Generic groupBy function.
3050c16b537SWarner Losh  * Groups an array sorted by cmp into groups with equivalent values.
3060c16b537SWarner Losh  * Calls grp for each group.
3070c16b537SWarner Losh  */
3080c16b537SWarner Losh static void
3090c16b537SWarner Losh COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
3100c16b537SWarner Losh               int (*cmp)(COVER_ctx_t *, const void *, const void *),
3110c16b537SWarner Losh               void (*grp)(COVER_ctx_t *, const void *, const void *)) {
3120c16b537SWarner Losh   const BYTE *ptr = (const BYTE *)data;
3130c16b537SWarner Losh   size_t num = 0;
3140c16b537SWarner Losh   while (num < count) {
3150c16b537SWarner Losh     const BYTE *grpEnd = ptr + size;
3160c16b537SWarner Losh     ++num;
3170c16b537SWarner Losh     while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
3180c16b537SWarner Losh       grpEnd += size;
3190c16b537SWarner Losh       ++num;
3200c16b537SWarner Losh     }
3210c16b537SWarner Losh     grp(ctx, ptr, grpEnd);
3220c16b537SWarner Losh     ptr = grpEnd;
3230c16b537SWarner Losh   }
3240c16b537SWarner Losh }
3250c16b537SWarner Losh 
3260c16b537SWarner Losh /*-*************************************
3270c16b537SWarner Losh *  Cover functions
3280c16b537SWarner Losh ***************************************/
3290c16b537SWarner Losh 
3300c16b537SWarner Losh /**
3310c16b537SWarner Losh  * Called on each group of positions with the same dmer.
3320c16b537SWarner Losh  * Counts the frequency of each dmer and saves it in the suffix array.
3330c16b537SWarner Losh  * Fills `ctx->dmerAt`.
3340c16b537SWarner Losh  */
3350c16b537SWarner Losh static void COVER_group(COVER_ctx_t *ctx, const void *group,
3360c16b537SWarner Losh                         const void *groupEnd) {
3370c16b537SWarner Losh   /* The group consists of all the positions with the same first d bytes. */
3380c16b537SWarner Losh   const U32 *grpPtr = (const U32 *)group;
3390c16b537SWarner Losh   const U32 *grpEnd = (const U32 *)groupEnd;
3400c16b537SWarner Losh   /* The dmerId is how we will reference this dmer.
3410c16b537SWarner Losh    * This allows us to map the whole dmer space to a much smaller space, the
3420c16b537SWarner Losh    * size of the suffix array.
3430c16b537SWarner Losh    */
3440c16b537SWarner Losh   const U32 dmerId = (U32)(grpPtr - ctx->suffix);
3450c16b537SWarner Losh   /* Count the number of samples this dmer shows up in */
3460c16b537SWarner Losh   U32 freq = 0;
3470c16b537SWarner Losh   /* Details */
3480c16b537SWarner Losh   const size_t *curOffsetPtr = ctx->offsets;
3490c16b537SWarner Losh   const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
3500c16b537SWarner Losh   /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
3510c16b537SWarner Losh    * different sample than the last.
3520c16b537SWarner Losh    */
3530c16b537SWarner Losh   size_t curSampleEnd = ctx->offsets[0];
3540c16b537SWarner Losh   for (; grpPtr != grpEnd; ++grpPtr) {
3550c16b537SWarner Losh     /* Save the dmerId for this position so we can get back to it. */
3560c16b537SWarner Losh     ctx->dmerAt[*grpPtr] = dmerId;
3570c16b537SWarner Losh     /* Dictionaries only help for the first reference to the dmer.
3580c16b537SWarner Losh      * After that zstd can reference the match from the previous reference.
3590c16b537SWarner Losh      * So only count each dmer once for each sample it is in.
3600c16b537SWarner Losh      */
3610c16b537SWarner Losh     if (*grpPtr < curSampleEnd) {
3620c16b537SWarner Losh       continue;
3630c16b537SWarner Losh     }
3640c16b537SWarner Losh     freq += 1;
3650c16b537SWarner Losh     /* Binary search to find the end of the sample *grpPtr is in.
3660c16b537SWarner Losh      * In the common case that grpPtr + 1 == grpEnd we can skip the binary
3670c16b537SWarner Losh      * search because the loop is over.
3680c16b537SWarner Losh      */
3690c16b537SWarner Losh     if (grpPtr + 1 != grpEnd) {
3700c16b537SWarner Losh       const size_t *sampleEndPtr =
3710c16b537SWarner Losh           COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
3720c16b537SWarner Losh       curSampleEnd = *sampleEndPtr;
3730c16b537SWarner Losh       curOffsetPtr = sampleEndPtr + 1;
3740c16b537SWarner Losh     }
3750c16b537SWarner Losh   }
3760c16b537SWarner Losh   /* At this point we are never going to look at this segment of the suffix
3770c16b537SWarner Losh    * array again.  We take advantage of this fact to save memory.
3780c16b537SWarner Losh    * We store the frequency of the dmer in the first position of the group,
3790c16b537SWarner Losh    * which is dmerId.
3800c16b537SWarner Losh    */
3810c16b537SWarner Losh   ctx->suffix[dmerId] = freq;
3820c16b537SWarner Losh }
3830c16b537SWarner Losh 
3840c16b537SWarner Losh 
3850c16b537SWarner Losh /**
3860c16b537SWarner Losh  * Selects the best segment in an epoch.
3870c16b537SWarner Losh  * Segments of are scored according to the function:
3880c16b537SWarner Losh  *
3890c16b537SWarner Losh  * Let F(d) be the frequency of dmer d.
3900c16b537SWarner Losh  * Let S_i be the dmer at position i of segment S which has length k.
3910c16b537SWarner Losh  *
3920c16b537SWarner Losh  *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
3930c16b537SWarner Losh  *
3942b9c00cbSConrad Meyer  * Once the dmer d is in the dictionary we set F(d) = 0.
3950c16b537SWarner Losh  */
3960c16b537SWarner Losh static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
3970c16b537SWarner Losh                                            COVER_map_t *activeDmers, U32 begin,
3980c16b537SWarner Losh                                            U32 end,
3990c16b537SWarner Losh                                            ZDICT_cover_params_t parameters) {
4000c16b537SWarner Losh   /* Constants */
4010c16b537SWarner Losh   const U32 k = parameters.k;
4020c16b537SWarner Losh   const U32 d = parameters.d;
4030c16b537SWarner Losh   const U32 dmersInK = k - d + 1;
4040c16b537SWarner Losh   /* Try each segment (activeSegment) and save the best (bestSegment) */
4050c16b537SWarner Losh   COVER_segment_t bestSegment = {0, 0, 0};
4060c16b537SWarner Losh   COVER_segment_t activeSegment;
4070c16b537SWarner Losh   /* Reset the activeDmers in the segment */
4080c16b537SWarner Losh   COVER_map_clear(activeDmers);
4090c16b537SWarner Losh   /* The activeSegment starts at the beginning of the epoch. */
4100c16b537SWarner Losh   activeSegment.begin = begin;
4110c16b537SWarner Losh   activeSegment.end = begin;
4120c16b537SWarner Losh   activeSegment.score = 0;
4130c16b537SWarner Losh   /* Slide the activeSegment through the whole epoch.
4140c16b537SWarner Losh    * Save the best segment in bestSegment.
4150c16b537SWarner Losh    */
4160c16b537SWarner Losh   while (activeSegment.end < end) {
4170c16b537SWarner Losh     /* The dmerId for the dmer at the next position */
4180c16b537SWarner Losh     U32 newDmer = ctx->dmerAt[activeSegment.end];
4190c16b537SWarner Losh     /* The entry in activeDmers for this dmerId */
4200c16b537SWarner Losh     U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
4210c16b537SWarner Losh     /* If the dmer isn't already present in the segment add its score. */
4220c16b537SWarner Losh     if (*newDmerOcc == 0) {
4230c16b537SWarner Losh       /* The paper suggest using the L-0.5 norm, but experiments show that it
4240c16b537SWarner Losh        * doesn't help.
4250c16b537SWarner Losh        */
4260c16b537SWarner Losh       activeSegment.score += freqs[newDmer];
4270c16b537SWarner Losh     }
4280c16b537SWarner Losh     /* Add the dmer to the segment */
4290c16b537SWarner Losh     activeSegment.end += 1;
4300c16b537SWarner Losh     *newDmerOcc += 1;
4310c16b537SWarner Losh 
4320c16b537SWarner Losh     /* If the window is now too large, drop the first position */
4330c16b537SWarner Losh     if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
4340c16b537SWarner Losh       U32 delDmer = ctx->dmerAt[activeSegment.begin];
4350c16b537SWarner Losh       U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
4360c16b537SWarner Losh       activeSegment.begin += 1;
4370c16b537SWarner Losh       *delDmerOcc -= 1;
4382b9c00cbSConrad Meyer       /* If this is the last occurrence of the dmer, subtract its score */
4390c16b537SWarner Losh       if (*delDmerOcc == 0) {
4400c16b537SWarner Losh         COVER_map_remove(activeDmers, delDmer);
4410c16b537SWarner Losh         activeSegment.score -= freqs[delDmer];
4420c16b537SWarner Losh       }
4430c16b537SWarner Losh     }
4440c16b537SWarner Losh 
4450c16b537SWarner Losh     /* If this segment is the best so far save it */
4460c16b537SWarner Losh     if (activeSegment.score > bestSegment.score) {
4470c16b537SWarner Losh       bestSegment = activeSegment;
4480c16b537SWarner Losh     }
4490c16b537SWarner Losh   }
4500c16b537SWarner Losh   {
4510c16b537SWarner Losh     /* Trim off the zero frequency head and tail from the segment. */
4520c16b537SWarner Losh     U32 newBegin = bestSegment.end;
4530c16b537SWarner Losh     U32 newEnd = bestSegment.begin;
4540c16b537SWarner Losh     U32 pos;
4550c16b537SWarner Losh     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
4560c16b537SWarner Losh       U32 freq = freqs[ctx->dmerAt[pos]];
4570c16b537SWarner Losh       if (freq != 0) {
4580c16b537SWarner Losh         newBegin = MIN(newBegin, pos);
4590c16b537SWarner Losh         newEnd = pos + 1;
4600c16b537SWarner Losh       }
4610c16b537SWarner Losh     }
4620c16b537SWarner Losh     bestSegment.begin = newBegin;
4630c16b537SWarner Losh     bestSegment.end = newEnd;
4640c16b537SWarner Losh   }
4650c16b537SWarner Losh   {
4660c16b537SWarner Losh     /* Zero out the frequency of each dmer covered by the chosen segment. */
4670c16b537SWarner Losh     U32 pos;
4680c16b537SWarner Losh     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
4690c16b537SWarner Losh       freqs[ctx->dmerAt[pos]] = 0;
4700c16b537SWarner Losh     }
4710c16b537SWarner Losh   }
4720c16b537SWarner Losh   return bestSegment;
4730c16b537SWarner Losh }
4740c16b537SWarner Losh 
4750c16b537SWarner Losh /**
4760c16b537SWarner Losh  * Check the validity of the parameters.
4770c16b537SWarner Losh  * Returns non-zero if the parameters are valid and 0 otherwise.
4780c16b537SWarner Losh  */
4790c16b537SWarner Losh static int COVER_checkParameters(ZDICT_cover_params_t parameters,
4800c16b537SWarner Losh                                  size_t maxDictSize) {
4810c16b537SWarner Losh   /* k and d are required parameters */
4820c16b537SWarner Losh   if (parameters.d == 0 || parameters.k == 0) {
4830c16b537SWarner Losh     return 0;
4840c16b537SWarner Losh   }
4850c16b537SWarner Losh   /* k <= maxDictSize */
4860c16b537SWarner Losh   if (parameters.k > maxDictSize) {
4870c16b537SWarner Losh     return 0;
4880c16b537SWarner Losh   }
4890c16b537SWarner Losh   /* d <= k */
4900c16b537SWarner Losh   if (parameters.d > parameters.k) {
4910c16b537SWarner Losh     return 0;
4920c16b537SWarner Losh   }
4930f743729SConrad Meyer   /* 0 < splitPoint <= 1 */
4940f743729SConrad Meyer   if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
4950f743729SConrad Meyer     return 0;
4960f743729SConrad Meyer   }
4970c16b537SWarner Losh   return 1;
4980c16b537SWarner Losh }
4990c16b537SWarner Losh 
5000c16b537SWarner Losh /**
5010c16b537SWarner Losh  * Clean up a context initialized with `COVER_ctx_init()`.
5020c16b537SWarner Losh  */
5030c16b537SWarner Losh static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
5040c16b537SWarner Losh   if (!ctx) {
5050c16b537SWarner Losh     return;
5060c16b537SWarner Losh   }
5070c16b537SWarner Losh   if (ctx->suffix) {
5080c16b537SWarner Losh     free(ctx->suffix);
5090c16b537SWarner Losh     ctx->suffix = NULL;
5100c16b537SWarner Losh   }
5110c16b537SWarner Losh   if (ctx->freqs) {
5120c16b537SWarner Losh     free(ctx->freqs);
5130c16b537SWarner Losh     ctx->freqs = NULL;
5140c16b537SWarner Losh   }
5150c16b537SWarner Losh   if (ctx->dmerAt) {
5160c16b537SWarner Losh     free(ctx->dmerAt);
5170c16b537SWarner Losh     ctx->dmerAt = NULL;
5180c16b537SWarner Losh   }
5190c16b537SWarner Losh   if (ctx->offsets) {
5200c16b537SWarner Losh     free(ctx->offsets);
5210c16b537SWarner Losh     ctx->offsets = NULL;
5220c16b537SWarner Losh   }
5230c16b537SWarner Losh }
5240c16b537SWarner Losh 
5250c16b537SWarner Losh /**
5260c16b537SWarner Losh  * Prepare a context for dictionary building.
5270c16b537SWarner Losh  * The context is only dependent on the parameter `d` and can used multiple
5280c16b537SWarner Losh  * times.
529*4d3f1eafSConrad Meyer  * Returns 0 on success or error code on error.
5300c16b537SWarner Losh  * The context must be destroyed with `COVER_ctx_destroy()`.
5310c16b537SWarner Losh  */
532*4d3f1eafSConrad Meyer static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
5330c16b537SWarner Losh                           const size_t *samplesSizes, unsigned nbSamples,
5340f743729SConrad Meyer                           unsigned d, double splitPoint) {
5350c16b537SWarner Losh   const BYTE *const samples = (const BYTE *)samplesBuffer;
5360c16b537SWarner Losh   const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
5370f743729SConrad Meyer   /* Split samples into testing and training sets */
5380f743729SConrad Meyer   const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
5390f743729SConrad Meyer   const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
5400f743729SConrad Meyer   const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
5410f743729SConrad Meyer   const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
5420c16b537SWarner Losh   /* Checks */
5430c16b537SWarner Losh   if (totalSamplesSize < MAX(d, sizeof(U64)) ||
5440c16b537SWarner Losh       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
54519fcbaf1SConrad Meyer     DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
546a0483764SConrad Meyer                  (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
547*4d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
5480c16b537SWarner Losh   }
5490f743729SConrad Meyer   /* Check if there are at least 5 training samples */
5500f743729SConrad Meyer   if (nbTrainSamples < 5) {
5510f743729SConrad Meyer     DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
552*4d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
5530f743729SConrad Meyer   }
5540f743729SConrad Meyer   /* Check if there's testing sample */
5550f743729SConrad Meyer   if (nbTestSamples < 1) {
5560f743729SConrad Meyer     DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
557*4d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
5580f743729SConrad Meyer   }
5590c16b537SWarner Losh   /* Zero the context */
5600c16b537SWarner Losh   memset(ctx, 0, sizeof(*ctx));
5610f743729SConrad Meyer   DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
562a0483764SConrad Meyer                (unsigned)trainingSamplesSize);
5630f743729SConrad Meyer   DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
564a0483764SConrad Meyer                (unsigned)testSamplesSize);
5650c16b537SWarner Losh   ctx->samples = samples;
5660c16b537SWarner Losh   ctx->samplesSizes = samplesSizes;
5670c16b537SWarner Losh   ctx->nbSamples = nbSamples;
5680f743729SConrad Meyer   ctx->nbTrainSamples = nbTrainSamples;
5690f743729SConrad Meyer   ctx->nbTestSamples = nbTestSamples;
5700c16b537SWarner Losh   /* Partial suffix array */
5710f743729SConrad Meyer   ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
5720c16b537SWarner Losh   ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
5730c16b537SWarner Losh   /* Maps index to the dmerID */
5740c16b537SWarner Losh   ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
5750c16b537SWarner Losh   /* The offsets of each file */
5760c16b537SWarner Losh   ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
5770c16b537SWarner Losh   if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
5780c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
5790c16b537SWarner Losh     COVER_ctx_destroy(ctx);
580*4d3f1eafSConrad Meyer     return ERROR(memory_allocation);
5810c16b537SWarner Losh   }
5820c16b537SWarner Losh   ctx->freqs = NULL;
5830c16b537SWarner Losh   ctx->d = d;
5840c16b537SWarner Losh 
5850f743729SConrad Meyer   /* Fill offsets from the samplesSizes */
5860c16b537SWarner Losh   {
5870c16b537SWarner Losh     U32 i;
5880c16b537SWarner Losh     ctx->offsets[0] = 0;
5890c16b537SWarner Losh     for (i = 1; i <= nbSamples; ++i) {
5900c16b537SWarner Losh       ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
5910c16b537SWarner Losh     }
5920c16b537SWarner Losh   }
5930c16b537SWarner Losh   DISPLAYLEVEL(2, "Constructing partial suffix array\n");
5940c16b537SWarner Losh   {
5950c16b537SWarner Losh     /* suffix is a partial suffix array.
5960c16b537SWarner Losh      * It only sorts suffixes by their first parameters.d bytes.
5970c16b537SWarner Losh      * The sort is stable, so each dmer group is sorted by position in input.
5980c16b537SWarner Losh      */
5990c16b537SWarner Losh     U32 i;
6000c16b537SWarner Losh     for (i = 0; i < ctx->suffixSize; ++i) {
6010c16b537SWarner Losh       ctx->suffix[i] = i;
6020c16b537SWarner Losh     }
6030f743729SConrad Meyer     /* qsort doesn't take an opaque pointer, so pass as a global.
6040f743729SConrad Meyer      * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
6050f743729SConrad Meyer      */
6060c16b537SWarner Losh     g_ctx = ctx;
6070f743729SConrad Meyer #if defined(__OpenBSD__)
6080f743729SConrad Meyer     mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
6090f743729SConrad Meyer           (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
6100f743729SConrad Meyer #else
6110c16b537SWarner Losh     qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
6120c16b537SWarner Losh           (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
6130f743729SConrad Meyer #endif
6140c16b537SWarner Losh   }
6150c16b537SWarner Losh   DISPLAYLEVEL(2, "Computing frequencies\n");
6160c16b537SWarner Losh   /* For each dmer group (group of positions with the same first d bytes):
6170c16b537SWarner Losh    * 1. For each position we set dmerAt[position] = dmerID.  The dmerID is
6180c16b537SWarner Losh    *    (groupBeginPtr - suffix).  This allows us to go from position to
6190c16b537SWarner Losh    *    dmerID so we can look up values in freq.
6200c16b537SWarner Losh    * 2. We calculate how many samples the dmer occurs in and save it in
6210c16b537SWarner Losh    *    freqs[dmerId].
6220c16b537SWarner Losh    */
6230c16b537SWarner Losh   COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
6240c16b537SWarner Losh                 (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
6250c16b537SWarner Losh   ctx->freqs = ctx->suffix;
6260c16b537SWarner Losh   ctx->suffix = NULL;
627*4d3f1eafSConrad Meyer   return 0;
6280c16b537SWarner Losh }
6290c16b537SWarner Losh 
6302b9c00cbSConrad Meyer void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
6312b9c00cbSConrad Meyer {
6322b9c00cbSConrad Meyer   const double ratio = (double)nbDmers / maxDictSize;
6332b9c00cbSConrad Meyer   if (ratio >= 10) {
6342b9c00cbSConrad Meyer       return;
6352b9c00cbSConrad Meyer   }
6362b9c00cbSConrad Meyer   LOCALDISPLAYLEVEL(displayLevel, 1,
6372b9c00cbSConrad Meyer                     "WARNING: The maximum dictionary size %u is too large "
6382b9c00cbSConrad Meyer                     "compared to the source size %u! "
6392b9c00cbSConrad Meyer                     "size(source)/size(dictionary) = %f, but it should be >= "
6402b9c00cbSConrad Meyer                     "10! This may lead to a subpar dictionary! We recommend "
6412b9c00cbSConrad Meyer                     "training on sources at least 10x, and up to 100x the "
6422b9c00cbSConrad Meyer                     "size of the dictionary!\n", (U32)maxDictSize,
6432b9c00cbSConrad Meyer                     (U32)nbDmers, ratio);
6442b9c00cbSConrad Meyer }
6452b9c00cbSConrad Meyer 
6462b9c00cbSConrad Meyer COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
6472b9c00cbSConrad Meyer                                        U32 nbDmers, U32 k, U32 passes)
6482b9c00cbSConrad Meyer {
6492b9c00cbSConrad Meyer   const U32 minEpochSize = k * 10;
6502b9c00cbSConrad Meyer   COVER_epoch_info_t epochs;
6512b9c00cbSConrad Meyer   epochs.num = MAX(1, maxDictSize / k / passes);
6522b9c00cbSConrad Meyer   epochs.size = nbDmers / epochs.num;
6532b9c00cbSConrad Meyer   if (epochs.size >= minEpochSize) {
6542b9c00cbSConrad Meyer       assert(epochs.size * epochs.num <= nbDmers);
6552b9c00cbSConrad Meyer       return epochs;
6562b9c00cbSConrad Meyer   }
6572b9c00cbSConrad Meyer   epochs.size = MIN(minEpochSize, nbDmers);
6582b9c00cbSConrad Meyer   epochs.num = nbDmers / epochs.size;
6592b9c00cbSConrad Meyer   assert(epochs.size * epochs.num <= nbDmers);
6602b9c00cbSConrad Meyer   return epochs;
6612b9c00cbSConrad Meyer }
6622b9c00cbSConrad Meyer 
6630c16b537SWarner Losh /**
6640c16b537SWarner Losh  * Given the prepared context build the dictionary.
6650c16b537SWarner Losh  */
6660c16b537SWarner Losh static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
6670c16b537SWarner Losh                                     COVER_map_t *activeDmers, void *dictBuffer,
6680c16b537SWarner Losh                                     size_t dictBufferCapacity,
6690c16b537SWarner Losh                                     ZDICT_cover_params_t parameters) {
6700c16b537SWarner Losh   BYTE *const dict = (BYTE *)dictBuffer;
6710c16b537SWarner Losh   size_t tail = dictBufferCapacity;
6722b9c00cbSConrad Meyer   /* Divide the data into epochs. We will select one segment from each epoch. */
6732b9c00cbSConrad Meyer   const COVER_epoch_info_t epochs = COVER_computeEpochs(
6742b9c00cbSConrad Meyer       (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
6752b9c00cbSConrad Meyer   const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
6762b9c00cbSConrad Meyer   size_t zeroScoreRun = 0;
6770c16b537SWarner Losh   size_t epoch;
678a0483764SConrad Meyer   DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
6792b9c00cbSConrad Meyer                 (U32)epochs.num, (U32)epochs.size);
6800c16b537SWarner Losh   /* Loop through the epochs until there are no more segments or the dictionary
6810c16b537SWarner Losh    * is full.
6820c16b537SWarner Losh    */
6832b9c00cbSConrad Meyer   for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
6842b9c00cbSConrad Meyer     const U32 epochBegin = (U32)(epoch * epochs.size);
6852b9c00cbSConrad Meyer     const U32 epochEnd = epochBegin + epochs.size;
6860c16b537SWarner Losh     size_t segmentSize;
6870c16b537SWarner Losh     /* Select a segment */
6880c16b537SWarner Losh     COVER_segment_t segment = COVER_selectSegment(
6890c16b537SWarner Losh         ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
6902b9c00cbSConrad Meyer     /* If the segment covers no dmers, then we are out of content.
6912b9c00cbSConrad Meyer      * There may be new content in other epochs, for continue for some time.
6922b9c00cbSConrad Meyer      */
6930c16b537SWarner Losh     if (segment.score == 0) {
6942b9c00cbSConrad Meyer       if (++zeroScoreRun >= maxZeroScoreRun) {
6950c16b537SWarner Losh           break;
6960c16b537SWarner Losh       }
6972b9c00cbSConrad Meyer       continue;
6982b9c00cbSConrad Meyer     }
6992b9c00cbSConrad Meyer     zeroScoreRun = 0;
7000c16b537SWarner Losh     /* Trim the segment if necessary and if it is too small then we are done */
7010c16b537SWarner Losh     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
7020c16b537SWarner Losh     if (segmentSize < parameters.d) {
7030c16b537SWarner Losh       break;
7040c16b537SWarner Losh     }
7050c16b537SWarner Losh     /* We fill the dictionary from the back to allow the best segments to be
7060c16b537SWarner Losh      * referenced with the smallest offsets.
7070c16b537SWarner Losh      */
7080c16b537SWarner Losh     tail -= segmentSize;
7090c16b537SWarner Losh     memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
7100c16b537SWarner Losh     DISPLAYUPDATE(
7110c16b537SWarner Losh         2, "\r%u%%       ",
712a0483764SConrad Meyer         (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
7130c16b537SWarner Losh   }
7140c16b537SWarner Losh   DISPLAYLEVEL(2, "\r%79s\r", "");
7150c16b537SWarner Losh   return tail;
7160c16b537SWarner Losh }
7170c16b537SWarner Losh 
7180c16b537SWarner Losh ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
71919fcbaf1SConrad Meyer     void *dictBuffer, size_t dictBufferCapacity,
72019fcbaf1SConrad Meyer     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
72119fcbaf1SConrad Meyer     ZDICT_cover_params_t parameters)
72219fcbaf1SConrad Meyer {
7230c16b537SWarner Losh   BYTE* const dict = (BYTE*)dictBuffer;
7240c16b537SWarner Losh   COVER_ctx_t ctx;
7250c16b537SWarner Losh   COVER_map_t activeDmers;
7260f743729SConrad Meyer   parameters.splitPoint = 1.0;
72719fcbaf1SConrad Meyer   /* Initialize global data */
72819fcbaf1SConrad Meyer   g_displayLevel = parameters.zParams.notificationLevel;
7290c16b537SWarner Losh   /* Checks */
7300c16b537SWarner Losh   if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
7310c16b537SWarner Losh     DISPLAYLEVEL(1, "Cover parameters incorrect\n");
732*4d3f1eafSConrad Meyer     return ERROR(parameter_outOfBound);
7330c16b537SWarner Losh   }
7340c16b537SWarner Losh   if (nbSamples == 0) {
7350c16b537SWarner Losh     DISPLAYLEVEL(1, "Cover must have at least one input file\n");
736*4d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
7370c16b537SWarner Losh   }
7380c16b537SWarner Losh   if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
7390c16b537SWarner Losh     DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
7400c16b537SWarner Losh                  ZDICT_DICTSIZE_MIN);
7410c16b537SWarner Losh     return ERROR(dstSize_tooSmall);
7420c16b537SWarner Losh   }
7430c16b537SWarner Losh   /* Initialize context and activeDmers */
744*4d3f1eafSConrad Meyer   {
745*4d3f1eafSConrad Meyer     size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
746*4d3f1eafSConrad Meyer                       parameters.d, parameters.splitPoint);
747*4d3f1eafSConrad Meyer     if (ZSTD_isError(initVal)) {
748*4d3f1eafSConrad Meyer       return initVal;
749*4d3f1eafSConrad Meyer     }
7500c16b537SWarner Losh   }
7512b9c00cbSConrad Meyer   COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
7520c16b537SWarner Losh   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
7530c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
7540c16b537SWarner Losh     COVER_ctx_destroy(&ctx);
755*4d3f1eafSConrad Meyer     return ERROR(memory_allocation);
7560c16b537SWarner Losh   }
7570c16b537SWarner Losh 
7580c16b537SWarner Losh   DISPLAYLEVEL(2, "Building dictionary\n");
7590c16b537SWarner Losh   {
7600c16b537SWarner Losh     const size_t tail =
7610c16b537SWarner Losh         COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
7620c16b537SWarner Losh                               dictBufferCapacity, parameters);
7630c16b537SWarner Losh     const size_t dictionarySize = ZDICT_finalizeDictionary(
7640c16b537SWarner Losh         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
7650c16b537SWarner Losh         samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
7660c16b537SWarner Losh     if (!ZSTD_isError(dictionarySize)) {
7670c16b537SWarner Losh       DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
768a0483764SConrad Meyer                    (unsigned)dictionarySize);
7690c16b537SWarner Losh     }
7700c16b537SWarner Losh     COVER_ctx_destroy(&ctx);
7710c16b537SWarner Losh     COVER_map_destroy(&activeDmers);
7720c16b537SWarner Losh     return dictionarySize;
7730c16b537SWarner Losh   }
7740c16b537SWarner Losh }
7750c16b537SWarner Losh 
7760f743729SConrad Meyer 
7770f743729SConrad Meyer 
7780f743729SConrad Meyer size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
7790f743729SConrad Meyer                                     const size_t *samplesSizes, const BYTE *samples,
7800f743729SConrad Meyer                                     size_t *offsets,
7810f743729SConrad Meyer                                     size_t nbTrainSamples, size_t nbSamples,
7820f743729SConrad Meyer                                     BYTE *const dict, size_t dictBufferCapacity) {
7830f743729SConrad Meyer   size_t totalCompressedSize = ERROR(GENERIC);
7840f743729SConrad Meyer   /* Pointers */
7850f743729SConrad Meyer   ZSTD_CCtx *cctx;
7860f743729SConrad Meyer   ZSTD_CDict *cdict;
7870f743729SConrad Meyer   void *dst;
7880f743729SConrad Meyer   /* Local variables */
7890f743729SConrad Meyer   size_t dstCapacity;
7900f743729SConrad Meyer   size_t i;
7910f743729SConrad Meyer   /* Allocate dst with enough space to compress the maximum sized sample */
7920f743729SConrad Meyer   {
7930f743729SConrad Meyer     size_t maxSampleSize = 0;
7940f743729SConrad Meyer     i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
7950f743729SConrad Meyer     for (; i < nbSamples; ++i) {
7960f743729SConrad Meyer       maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
7970f743729SConrad Meyer     }
7980f743729SConrad Meyer     dstCapacity = ZSTD_compressBound(maxSampleSize);
7990f743729SConrad Meyer     dst = malloc(dstCapacity);
8000f743729SConrad Meyer   }
8010f743729SConrad Meyer   /* Create the cctx and cdict */
8020f743729SConrad Meyer   cctx = ZSTD_createCCtx();
8030f743729SConrad Meyer   cdict = ZSTD_createCDict(dict, dictBufferCapacity,
8040f743729SConrad Meyer                            parameters.zParams.compressionLevel);
8050f743729SConrad Meyer   if (!dst || !cctx || !cdict) {
8060f743729SConrad Meyer     goto _compressCleanup;
8070f743729SConrad Meyer   }
8080f743729SConrad Meyer   /* Compress each sample and sum their sizes (or error) */
8090f743729SConrad Meyer   totalCompressedSize = dictBufferCapacity;
8100f743729SConrad Meyer   i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
8110f743729SConrad Meyer   for (; i < nbSamples; ++i) {
8120f743729SConrad Meyer     const size_t size = ZSTD_compress_usingCDict(
8130f743729SConrad Meyer         cctx, dst, dstCapacity, samples + offsets[i],
8140f743729SConrad Meyer         samplesSizes[i], cdict);
8150f743729SConrad Meyer     if (ZSTD_isError(size)) {
816*4d3f1eafSConrad Meyer       totalCompressedSize = size;
8170f743729SConrad Meyer       goto _compressCleanup;
8180f743729SConrad Meyer     }
8190f743729SConrad Meyer     totalCompressedSize += size;
8200f743729SConrad Meyer   }
8210f743729SConrad Meyer _compressCleanup:
8220f743729SConrad Meyer   ZSTD_freeCCtx(cctx);
8230f743729SConrad Meyer   ZSTD_freeCDict(cdict);
8240f743729SConrad Meyer   if (dst) {
8250f743729SConrad Meyer     free(dst);
8260f743729SConrad Meyer   }
8270f743729SConrad Meyer   return totalCompressedSize;
8280f743729SConrad Meyer }
8290f743729SConrad Meyer 
8300c16b537SWarner Losh 
8310c16b537SWarner Losh /**
8320c16b537SWarner Losh  * Initialize the `COVER_best_t`.
8330c16b537SWarner Losh  */
8340f743729SConrad Meyer void COVER_best_init(COVER_best_t *best) {
8350c16b537SWarner Losh   if (best==NULL) return; /* compatible with init on NULL */
8360c16b537SWarner Losh   (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
8370c16b537SWarner Losh   (void)ZSTD_pthread_cond_init(&best->cond, NULL);
8380c16b537SWarner Losh   best->liveJobs = 0;
8390c16b537SWarner Losh   best->dict = NULL;
8400c16b537SWarner Losh   best->dictSize = 0;
8410c16b537SWarner Losh   best->compressedSize = (size_t)-1;
8420c16b537SWarner Losh   memset(&best->parameters, 0, sizeof(best->parameters));
8430c16b537SWarner Losh }
8440c16b537SWarner Losh 
8450c16b537SWarner Losh /**
8460c16b537SWarner Losh  * Wait until liveJobs == 0.
8470c16b537SWarner Losh  */
8480f743729SConrad Meyer void COVER_best_wait(COVER_best_t *best) {
8490c16b537SWarner Losh   if (!best) {
8500c16b537SWarner Losh     return;
8510c16b537SWarner Losh   }
8520c16b537SWarner Losh   ZSTD_pthread_mutex_lock(&best->mutex);
8530c16b537SWarner Losh   while (best->liveJobs != 0) {
8540c16b537SWarner Losh     ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
8550c16b537SWarner Losh   }
8560c16b537SWarner Losh   ZSTD_pthread_mutex_unlock(&best->mutex);
8570c16b537SWarner Losh }
8580c16b537SWarner Losh 
8590c16b537SWarner Losh /**
8600c16b537SWarner Losh  * Call COVER_best_wait() and then destroy the COVER_best_t.
8610c16b537SWarner Losh  */
8620f743729SConrad Meyer void COVER_best_destroy(COVER_best_t *best) {
8630c16b537SWarner Losh   if (!best) {
8640c16b537SWarner Losh     return;
8650c16b537SWarner Losh   }
8660c16b537SWarner Losh   COVER_best_wait(best);
8670c16b537SWarner Losh   if (best->dict) {
8680c16b537SWarner Losh     free(best->dict);
8690c16b537SWarner Losh   }
8700c16b537SWarner Losh   ZSTD_pthread_mutex_destroy(&best->mutex);
8710c16b537SWarner Losh   ZSTD_pthread_cond_destroy(&best->cond);
8720c16b537SWarner Losh }
8730c16b537SWarner Losh 
8740c16b537SWarner Losh /**
8750c16b537SWarner Losh  * Called when a thread is about to be launched.
8760c16b537SWarner Losh  * Increments liveJobs.
8770c16b537SWarner Losh  */
8780f743729SConrad Meyer void COVER_best_start(COVER_best_t *best) {
8790c16b537SWarner Losh   if (!best) {
8800c16b537SWarner Losh     return;
8810c16b537SWarner Losh   }
8820c16b537SWarner Losh   ZSTD_pthread_mutex_lock(&best->mutex);
8830c16b537SWarner Losh   ++best->liveJobs;
8840c16b537SWarner Losh   ZSTD_pthread_mutex_unlock(&best->mutex);
8850c16b537SWarner Losh }
8860c16b537SWarner Losh 
8870c16b537SWarner Losh /**
8880c16b537SWarner Losh  * Called when a thread finishes executing, both on error or success.
8890c16b537SWarner Losh  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
8900c16b537SWarner Losh  * If this dictionary is the best so far save it and its parameters.
8910c16b537SWarner Losh  */
892*4d3f1eafSConrad Meyer void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
893*4d3f1eafSConrad Meyer                               COVER_dictSelection_t selection) {
894*4d3f1eafSConrad Meyer   void* dict = selection.dictContent;
895*4d3f1eafSConrad Meyer   size_t compressedSize = selection.totalCompressedSize;
896*4d3f1eafSConrad Meyer   size_t dictSize = selection.dictSize;
8970c16b537SWarner Losh   if (!best) {
8980c16b537SWarner Losh     return;
8990c16b537SWarner Losh   }
9000c16b537SWarner Losh   {
9010c16b537SWarner Losh     size_t liveJobs;
9020c16b537SWarner Losh     ZSTD_pthread_mutex_lock(&best->mutex);
9030c16b537SWarner Losh     --best->liveJobs;
9040c16b537SWarner Losh     liveJobs = best->liveJobs;
9050c16b537SWarner Losh     /* If the new dictionary is better */
9060c16b537SWarner Losh     if (compressedSize < best->compressedSize) {
9070c16b537SWarner Losh       /* Allocate space if necessary */
9080c16b537SWarner Losh       if (!best->dict || best->dictSize < dictSize) {
9090c16b537SWarner Losh         if (best->dict) {
9100c16b537SWarner Losh           free(best->dict);
9110c16b537SWarner Losh         }
9120c16b537SWarner Losh         best->dict = malloc(dictSize);
9130c16b537SWarner Losh         if (!best->dict) {
9140c16b537SWarner Losh           best->compressedSize = ERROR(GENERIC);
9150c16b537SWarner Losh           best->dictSize = 0;
916a0483764SConrad Meyer           ZSTD_pthread_cond_signal(&best->cond);
917a0483764SConrad Meyer           ZSTD_pthread_mutex_unlock(&best->mutex);
9180c16b537SWarner Losh           return;
9190c16b537SWarner Losh         }
9200c16b537SWarner Losh       }
9210c16b537SWarner Losh       /* Save the dictionary, parameters, and size */
922*4d3f1eafSConrad Meyer       if (!dict) {
923*4d3f1eafSConrad Meyer         return;
924*4d3f1eafSConrad Meyer       }
9250c16b537SWarner Losh       memcpy(best->dict, dict, dictSize);
9260c16b537SWarner Losh       best->dictSize = dictSize;
9270c16b537SWarner Losh       best->parameters = parameters;
9280c16b537SWarner Losh       best->compressedSize = compressedSize;
9290c16b537SWarner Losh     }
9300c16b537SWarner Losh     if (liveJobs == 0) {
9310c16b537SWarner Losh       ZSTD_pthread_cond_broadcast(&best->cond);
9320c16b537SWarner Losh     }
9330f743729SConrad Meyer     ZSTD_pthread_mutex_unlock(&best->mutex);
9340c16b537SWarner Losh   }
9350c16b537SWarner Losh }
9360c16b537SWarner Losh 
937*4d3f1eafSConrad Meyer COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
938*4d3f1eafSConrad Meyer     COVER_dictSelection_t selection = { NULL, 0, error };
939*4d3f1eafSConrad Meyer     return selection;
940*4d3f1eafSConrad Meyer }
941*4d3f1eafSConrad Meyer 
942*4d3f1eafSConrad Meyer unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
943*4d3f1eafSConrad Meyer   return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
944*4d3f1eafSConrad Meyer }
945*4d3f1eafSConrad Meyer 
946*4d3f1eafSConrad Meyer void COVER_dictSelectionFree(COVER_dictSelection_t selection){
947*4d3f1eafSConrad Meyer   free(selection.dictContent);
948*4d3f1eafSConrad Meyer }
949*4d3f1eafSConrad Meyer 
950*4d3f1eafSConrad Meyer COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
951*4d3f1eafSConrad Meyer         size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
952*4d3f1eafSConrad Meyer         size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
953*4d3f1eafSConrad Meyer 
954*4d3f1eafSConrad Meyer   size_t largestDict = 0;
955*4d3f1eafSConrad Meyer   size_t largestCompressed = 0;
956*4d3f1eafSConrad Meyer   BYTE* customDictContentEnd = customDictContent + dictContentSize;
957*4d3f1eafSConrad Meyer 
958*4d3f1eafSConrad Meyer   BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
959*4d3f1eafSConrad Meyer   BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
960*4d3f1eafSConrad Meyer   double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
961*4d3f1eafSConrad Meyer 
962*4d3f1eafSConrad Meyer   if (!largestDictbuffer || !candidateDictBuffer) {
963*4d3f1eafSConrad Meyer     free(largestDictbuffer);
964*4d3f1eafSConrad Meyer     free(candidateDictBuffer);
965*4d3f1eafSConrad Meyer     return COVER_dictSelectionError(dictContentSize);
966*4d3f1eafSConrad Meyer   }
967*4d3f1eafSConrad Meyer 
968*4d3f1eafSConrad Meyer   /* Initial dictionary size and compressed size */
969*4d3f1eafSConrad Meyer   memcpy(largestDictbuffer, customDictContent, dictContentSize);
970*4d3f1eafSConrad Meyer   dictContentSize = ZDICT_finalizeDictionary(
971*4d3f1eafSConrad Meyer     largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
972*4d3f1eafSConrad Meyer     samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
973*4d3f1eafSConrad Meyer 
974*4d3f1eafSConrad Meyer   if (ZDICT_isError(dictContentSize)) {
975*4d3f1eafSConrad Meyer     free(largestDictbuffer);
976*4d3f1eafSConrad Meyer     free(candidateDictBuffer);
977*4d3f1eafSConrad Meyer     return COVER_dictSelectionError(dictContentSize);
978*4d3f1eafSConrad Meyer   }
979*4d3f1eafSConrad Meyer 
980*4d3f1eafSConrad Meyer   totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
981*4d3f1eafSConrad Meyer                                                        samplesBuffer, offsets,
982*4d3f1eafSConrad Meyer                                                        nbCheckSamples, nbSamples,
983*4d3f1eafSConrad Meyer                                                        largestDictbuffer, dictContentSize);
984*4d3f1eafSConrad Meyer 
985*4d3f1eafSConrad Meyer   if (ZSTD_isError(totalCompressedSize)) {
986*4d3f1eafSConrad Meyer     free(largestDictbuffer);
987*4d3f1eafSConrad Meyer     free(candidateDictBuffer);
988*4d3f1eafSConrad Meyer     return COVER_dictSelectionError(totalCompressedSize);
989*4d3f1eafSConrad Meyer   }
990*4d3f1eafSConrad Meyer 
991*4d3f1eafSConrad Meyer   if (params.shrinkDict == 0) {
992*4d3f1eafSConrad Meyer     COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
993*4d3f1eafSConrad Meyer     free(candidateDictBuffer);
994*4d3f1eafSConrad Meyer     return selection;
995*4d3f1eafSConrad Meyer   }
996*4d3f1eafSConrad Meyer 
997*4d3f1eafSConrad Meyer   largestDict = dictContentSize;
998*4d3f1eafSConrad Meyer   largestCompressed = totalCompressedSize;
999*4d3f1eafSConrad Meyer   dictContentSize = ZDICT_DICTSIZE_MIN;
1000*4d3f1eafSConrad Meyer 
1001*4d3f1eafSConrad Meyer   /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
1002*4d3f1eafSConrad Meyer   while (dictContentSize < largestDict) {
1003*4d3f1eafSConrad Meyer     memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
1004*4d3f1eafSConrad Meyer     dictContentSize = ZDICT_finalizeDictionary(
1005*4d3f1eafSConrad Meyer       candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
1006*4d3f1eafSConrad Meyer       samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
1007*4d3f1eafSConrad Meyer 
1008*4d3f1eafSConrad Meyer     if (ZDICT_isError(dictContentSize)) {
1009*4d3f1eafSConrad Meyer       free(largestDictbuffer);
1010*4d3f1eafSConrad Meyer       free(candidateDictBuffer);
1011*4d3f1eafSConrad Meyer       return COVER_dictSelectionError(dictContentSize);
1012*4d3f1eafSConrad Meyer 
1013*4d3f1eafSConrad Meyer     }
1014*4d3f1eafSConrad Meyer 
1015*4d3f1eafSConrad Meyer     totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
1016*4d3f1eafSConrad Meyer                                                          samplesBuffer, offsets,
1017*4d3f1eafSConrad Meyer                                                          nbCheckSamples, nbSamples,
1018*4d3f1eafSConrad Meyer                                                          candidateDictBuffer, dictContentSize);
1019*4d3f1eafSConrad Meyer 
1020*4d3f1eafSConrad Meyer     if (ZSTD_isError(totalCompressedSize)) {
1021*4d3f1eafSConrad Meyer       free(largestDictbuffer);
1022*4d3f1eafSConrad Meyer       free(candidateDictBuffer);
1023*4d3f1eafSConrad Meyer       return COVER_dictSelectionError(totalCompressedSize);
1024*4d3f1eafSConrad Meyer     }
1025*4d3f1eafSConrad Meyer 
1026*4d3f1eafSConrad Meyer     if (totalCompressedSize <= largestCompressed * regressionTolerance) {
1027*4d3f1eafSConrad Meyer       COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
1028*4d3f1eafSConrad Meyer       free(largestDictbuffer);
1029*4d3f1eafSConrad Meyer       return selection;
1030*4d3f1eafSConrad Meyer     }
1031*4d3f1eafSConrad Meyer     dictContentSize *= 2;
1032*4d3f1eafSConrad Meyer   }
1033*4d3f1eafSConrad Meyer   dictContentSize = largestDict;
1034*4d3f1eafSConrad Meyer   totalCompressedSize = largestCompressed;
1035*4d3f1eafSConrad Meyer   {
1036*4d3f1eafSConrad Meyer     COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
1037*4d3f1eafSConrad Meyer     free(candidateDictBuffer);
1038*4d3f1eafSConrad Meyer     return selection;
1039*4d3f1eafSConrad Meyer   }
1040*4d3f1eafSConrad Meyer }
1041*4d3f1eafSConrad Meyer 
10420c16b537SWarner Losh /**
10430c16b537SWarner Losh  * Parameters for COVER_tryParameters().
10440c16b537SWarner Losh  */
10450c16b537SWarner Losh typedef struct COVER_tryParameters_data_s {
10460c16b537SWarner Losh   const COVER_ctx_t *ctx;
10470c16b537SWarner Losh   COVER_best_t *best;
10480c16b537SWarner Losh   size_t dictBufferCapacity;
10490c16b537SWarner Losh   ZDICT_cover_params_t parameters;
10500c16b537SWarner Losh } COVER_tryParameters_data_t;
10510c16b537SWarner Losh 
10520c16b537SWarner Losh /**
10530f743729SConrad Meyer  * Tries a set of parameters and updates the COVER_best_t with the results.
10540c16b537SWarner Losh  * This function is thread safe if zstd is compiled with multithreaded support.
10550c16b537SWarner Losh  * It takes its parameters as an *OWNING* opaque pointer to support threading.
10560c16b537SWarner Losh  */
10570c16b537SWarner Losh static void COVER_tryParameters(void *opaque) {
10580c16b537SWarner Losh   /* Save parameters as local variables */
10590c16b537SWarner Losh   COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
10600c16b537SWarner Losh   const COVER_ctx_t *const ctx = data->ctx;
10610c16b537SWarner Losh   const ZDICT_cover_params_t parameters = data->parameters;
10620c16b537SWarner Losh   size_t dictBufferCapacity = data->dictBufferCapacity;
10630c16b537SWarner Losh   size_t totalCompressedSize = ERROR(GENERIC);
10640c16b537SWarner Losh   /* Allocate space for hash table, dict, and freqs */
10650c16b537SWarner Losh   COVER_map_t activeDmers;
10660c16b537SWarner Losh   BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
1067*4d3f1eafSConrad Meyer   COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
10680c16b537SWarner Losh   U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
10690c16b537SWarner Losh   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
10700c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
10710c16b537SWarner Losh     goto _cleanup;
10720c16b537SWarner Losh   }
10730c16b537SWarner Losh   if (!dict || !freqs) {
10740c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
10750c16b537SWarner Losh     goto _cleanup;
10760c16b537SWarner Losh   }
10770c16b537SWarner Losh   /* Copy the frequencies because we need to modify them */
10780c16b537SWarner Losh   memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
10790c16b537SWarner Losh   /* Build the dictionary */
10800c16b537SWarner Losh   {
10810c16b537SWarner Losh     const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
10820c16b537SWarner Losh                                               dictBufferCapacity, parameters);
1083*4d3f1eafSConrad Meyer     selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
1084*4d3f1eafSConrad Meyer         ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
1085*4d3f1eafSConrad Meyer         totalCompressedSize);
1086*4d3f1eafSConrad Meyer 
1087*4d3f1eafSConrad Meyer     if (COVER_dictSelectionIsError(selection)) {
1088*4d3f1eafSConrad Meyer       DISPLAYLEVEL(1, "Failed to select dictionary\n");
10890c16b537SWarner Losh       goto _cleanup;
10900c16b537SWarner Losh     }
10910c16b537SWarner Losh   }
10920c16b537SWarner Losh _cleanup:
1093*4d3f1eafSConrad Meyer   free(dict);
1094*4d3f1eafSConrad Meyer   COVER_best_finish(data->best, parameters, selection);
10950c16b537SWarner Losh   free(data);
10960c16b537SWarner Losh   COVER_map_destroy(&activeDmers);
1097*4d3f1eafSConrad Meyer   COVER_dictSelectionFree(selection);
10980c16b537SWarner Losh   if (freqs) {
10990c16b537SWarner Losh     free(freqs);
11000c16b537SWarner Losh   }
11010c16b537SWarner Losh }
11020c16b537SWarner Losh 
11030c16b537SWarner Losh ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
11040c16b537SWarner Losh     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
11050c16b537SWarner Losh     const size_t *samplesSizes, unsigned nbSamples,
11060c16b537SWarner Losh     ZDICT_cover_params_t *parameters) {
11070c16b537SWarner Losh   /* constants */
11080c16b537SWarner Losh   const unsigned nbThreads = parameters->nbThreads;
11090f743729SConrad Meyer   const double splitPoint =
11100f743729SConrad Meyer       parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
11110c16b537SWarner Losh   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
11120c16b537SWarner Losh   const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
11130c16b537SWarner Losh   const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
11140c16b537SWarner Losh   const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
11150c16b537SWarner Losh   const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
11160c16b537SWarner Losh   const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
11170c16b537SWarner Losh   const unsigned kIterations =
11180c16b537SWarner Losh       (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
1119*4d3f1eafSConrad Meyer   const unsigned shrinkDict = 0;
11200c16b537SWarner Losh   /* Local variables */
11210c16b537SWarner Losh   const int displayLevel = parameters->zParams.notificationLevel;
11220c16b537SWarner Losh   unsigned iteration = 1;
11230c16b537SWarner Losh   unsigned d;
11240c16b537SWarner Losh   unsigned k;
11250c16b537SWarner Losh   COVER_best_t best;
11260c16b537SWarner Losh   POOL_ctx *pool = NULL;
11272b9c00cbSConrad Meyer   int warned = 0;
112819fcbaf1SConrad Meyer 
11290c16b537SWarner Losh   /* Checks */
11300f743729SConrad Meyer   if (splitPoint <= 0 || splitPoint > 1) {
11310f743729SConrad Meyer     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
1132*4d3f1eafSConrad Meyer     return ERROR(parameter_outOfBound);
11330f743729SConrad Meyer   }
11340c16b537SWarner Losh   if (kMinK < kMaxD || kMaxK < kMinK) {
11350c16b537SWarner Losh     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
1136*4d3f1eafSConrad Meyer     return ERROR(parameter_outOfBound);
11370c16b537SWarner Losh   }
11380c16b537SWarner Losh   if (nbSamples == 0) {
11390c16b537SWarner Losh     DISPLAYLEVEL(1, "Cover must have at least one input file\n");
1140*4d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
11410c16b537SWarner Losh   }
11420c16b537SWarner Losh   if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
11430c16b537SWarner Losh     DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
11440c16b537SWarner Losh                  ZDICT_DICTSIZE_MIN);
11450c16b537SWarner Losh     return ERROR(dstSize_tooSmall);
11460c16b537SWarner Losh   }
11470c16b537SWarner Losh   if (nbThreads > 1) {
11480c16b537SWarner Losh     pool = POOL_create(nbThreads, 1);
11490c16b537SWarner Losh     if (!pool) {
11500c16b537SWarner Losh       return ERROR(memory_allocation);
11510c16b537SWarner Losh     }
11520c16b537SWarner Losh   }
11530c16b537SWarner Losh   /* Initialization */
11540c16b537SWarner Losh   COVER_best_init(&best);
11550c16b537SWarner Losh   /* Turn down global display level to clean up display at level 2 and below */
11560c16b537SWarner Losh   g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
11570c16b537SWarner Losh   /* Loop through d first because each new value needs a new context */
11580c16b537SWarner Losh   LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
11590c16b537SWarner Losh                     kIterations);
11600c16b537SWarner Losh   for (d = kMinD; d <= kMaxD; d += 2) {
11610c16b537SWarner Losh     /* Initialize the context for this value of d */
11620c16b537SWarner Losh     COVER_ctx_t ctx;
11630c16b537SWarner Losh     LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
1164*4d3f1eafSConrad Meyer     {
1165*4d3f1eafSConrad Meyer       const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
1166*4d3f1eafSConrad Meyer       if (ZSTD_isError(initVal)) {
11670c16b537SWarner Losh         LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
11680c16b537SWarner Losh         COVER_best_destroy(&best);
11690c16b537SWarner Losh         POOL_free(pool);
1170*4d3f1eafSConrad Meyer         return initVal;
1171*4d3f1eafSConrad Meyer       }
11720c16b537SWarner Losh     }
11732b9c00cbSConrad Meyer     if (!warned) {
11742b9c00cbSConrad Meyer       COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
11752b9c00cbSConrad Meyer       warned = 1;
11762b9c00cbSConrad Meyer     }
11770c16b537SWarner Losh     /* Loop through k reusing the same context */
11780c16b537SWarner Losh     for (k = kMinK; k <= kMaxK; k += kStepSize) {
11790c16b537SWarner Losh       /* Prepare the arguments */
11800c16b537SWarner Losh       COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
11810c16b537SWarner Losh           sizeof(COVER_tryParameters_data_t));
11820c16b537SWarner Losh       LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
11830c16b537SWarner Losh       if (!data) {
11840c16b537SWarner Losh         LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
11850c16b537SWarner Losh         COVER_best_destroy(&best);
11860c16b537SWarner Losh         COVER_ctx_destroy(&ctx);
11870c16b537SWarner Losh         POOL_free(pool);
1188*4d3f1eafSConrad Meyer         return ERROR(memory_allocation);
11890c16b537SWarner Losh       }
11900c16b537SWarner Losh       data->ctx = &ctx;
11910c16b537SWarner Losh       data->best = &best;
11920c16b537SWarner Losh       data->dictBufferCapacity = dictBufferCapacity;
11930c16b537SWarner Losh       data->parameters = *parameters;
11940c16b537SWarner Losh       data->parameters.k = k;
11950c16b537SWarner Losh       data->parameters.d = d;
11960f743729SConrad Meyer       data->parameters.splitPoint = splitPoint;
11970c16b537SWarner Losh       data->parameters.steps = kSteps;
1198*4d3f1eafSConrad Meyer       data->parameters.shrinkDict = shrinkDict;
11990c16b537SWarner Losh       data->parameters.zParams.notificationLevel = g_displayLevel;
12000c16b537SWarner Losh       /* Check the parameters */
12010c16b537SWarner Losh       if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
12020c16b537SWarner Losh         DISPLAYLEVEL(1, "Cover parameters incorrect\n");
12030c16b537SWarner Losh         free(data);
12040c16b537SWarner Losh         continue;
12050c16b537SWarner Losh       }
12060c16b537SWarner Losh       /* Call the function and pass ownership of data to it */
12070c16b537SWarner Losh       COVER_best_start(&best);
12080c16b537SWarner Losh       if (pool) {
12090c16b537SWarner Losh         POOL_add(pool, &COVER_tryParameters, data);
12100c16b537SWarner Losh       } else {
12110c16b537SWarner Losh         COVER_tryParameters(data);
12120c16b537SWarner Losh       }
12130c16b537SWarner Losh       /* Print status */
12140c16b537SWarner Losh       LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
1215a0483764SConrad Meyer                          (unsigned)((iteration * 100) / kIterations));
12160c16b537SWarner Losh       ++iteration;
12170c16b537SWarner Losh     }
12180c16b537SWarner Losh     COVER_best_wait(&best);
12190c16b537SWarner Losh     COVER_ctx_destroy(&ctx);
12200c16b537SWarner Losh   }
12210c16b537SWarner Losh   LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
12220c16b537SWarner Losh   /* Fill the output buffer and parameters with output of the best parameters */
12230c16b537SWarner Losh   {
12240c16b537SWarner Losh     const size_t dictSize = best.dictSize;
12250c16b537SWarner Losh     if (ZSTD_isError(best.compressedSize)) {
12260c16b537SWarner Losh       const size_t compressedSize = best.compressedSize;
12270c16b537SWarner Losh       COVER_best_destroy(&best);
12280c16b537SWarner Losh       POOL_free(pool);
12290c16b537SWarner Losh       return compressedSize;
12300c16b537SWarner Losh     }
12310c16b537SWarner Losh     *parameters = best.parameters;
12320c16b537SWarner Losh     memcpy(dictBuffer, best.dict, dictSize);
12330c16b537SWarner Losh     COVER_best_destroy(&best);
12340c16b537SWarner Losh     POOL_free(pool);
12350c16b537SWarner Losh     return dictSize;
12360c16b537SWarner Losh   }
12370c16b537SWarner Losh }
1238