xref: /freebsd/sys/contrib/zstd/lib/dictBuilder/cover.c (revision f7cd7fe51c4140960ebea00410ed62894f5625d1)
10c16b537SWarner Losh /*
237f1f268SConrad Meyer  * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
30c16b537SWarner Losh  * All rights reserved.
40c16b537SWarner Losh  *
50c16b537SWarner Losh  * This source code is licensed under both the BSD-style license (found in the
60c16b537SWarner Losh  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
70c16b537SWarner Losh  * in the COPYING file in the root directory of this source tree).
80c16b537SWarner Losh  * You may select, at your option, one of the above-listed licenses.
90c16b537SWarner Losh  */
100c16b537SWarner Losh 
110c16b537SWarner Losh /* *****************************************************************************
120c16b537SWarner Losh  * Constructs a dictionary using a heuristic based on the following paper:
130c16b537SWarner Losh  *
140c16b537SWarner Losh  * Liao, Petri, Moffat, Wirth
150c16b537SWarner Losh  * Effective Construction of Relative Lempel-Ziv Dictionaries
160c16b537SWarner Losh  * Published in WWW 2016.
170c16b537SWarner Losh  *
180c16b537SWarner Losh  * Adapted from code originally written by @ot (Giuseppe Ottaviano).
190c16b537SWarner Losh  ******************************************************************************/
200c16b537SWarner Losh 
210c16b537SWarner Losh /*-*************************************
220c16b537SWarner Losh *  Dependencies
230c16b537SWarner Losh ***************************************/
240c16b537SWarner Losh #include <stdio.h>  /* fprintf */
250c16b537SWarner Losh #include <stdlib.h> /* malloc, free, qsort */
260c16b537SWarner Losh #include <string.h> /* memset */
270c16b537SWarner Losh #include <time.h>   /* clock */
280c16b537SWarner Losh 
2937f1f268SConrad Meyer #include "../common/mem.h" /* read */
3037f1f268SConrad Meyer #include "../common/pool.h"
3137f1f268SConrad Meyer #include "../common/threading.h"
320f743729SConrad Meyer #include "cover.h"
3337f1f268SConrad Meyer #include "../common/zstd_internal.h" /* includes zstd.h */
340c16b537SWarner Losh #ifndef ZDICT_STATIC_LINKING_ONLY
350c16b537SWarner Losh #define ZDICT_STATIC_LINKING_ONLY
360c16b537SWarner Losh #endif
370c16b537SWarner Losh #include "zdict.h"
380c16b537SWarner Losh 
390c16b537SWarner Losh /*-*************************************
400c16b537SWarner Losh *  Constants
410c16b537SWarner Losh ***************************************/
42a0483764SConrad Meyer #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
43*f7cd7fe5SConrad Meyer #define COVER_DEFAULT_SPLITPOINT 1.0
440c16b537SWarner Losh 
450c16b537SWarner Losh /*-*************************************
460c16b537SWarner Losh *  Console display
470c16b537SWarner Losh ***************************************/
48*f7cd7fe5SConrad Meyer #ifndef LOCALDISPLAYLEVEL
490c16b537SWarner Losh static int g_displayLevel = 2;
50*f7cd7fe5SConrad Meyer #endif
51*f7cd7fe5SConrad Meyer #undef  DISPLAY
520c16b537SWarner Losh #define DISPLAY(...)                                                           \
530c16b537SWarner Losh   {                                                                            \
540c16b537SWarner Losh     fprintf(stderr, __VA_ARGS__);                                              \
550c16b537SWarner Losh     fflush(stderr);                                                            \
560c16b537SWarner Losh   }
57*f7cd7fe5SConrad Meyer #undef  LOCALDISPLAYLEVEL
580c16b537SWarner Losh #define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
590c16b537SWarner Losh   if (displayLevel >= l) {                                                     \
600c16b537SWarner Losh     DISPLAY(__VA_ARGS__);                                                      \
610c16b537SWarner Losh   } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
62*f7cd7fe5SConrad Meyer #undef  DISPLAYLEVEL
630c16b537SWarner Losh #define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
640c16b537SWarner Losh 
65*f7cd7fe5SConrad Meyer #ifndef LOCALDISPLAYUPDATE
66*f7cd7fe5SConrad Meyer static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
67*f7cd7fe5SConrad Meyer static clock_t g_time = 0;
68*f7cd7fe5SConrad Meyer #endif
69*f7cd7fe5SConrad Meyer #undef  LOCALDISPLAYUPDATE
700c16b537SWarner Losh #define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
710c16b537SWarner Losh   if (displayLevel >= l) {                                                     \
72*f7cd7fe5SConrad Meyer     if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) {             \
730c16b537SWarner Losh       g_time = clock();                                                        \
740c16b537SWarner Losh       DISPLAY(__VA_ARGS__);                                                    \
750c16b537SWarner Losh     }                                                                          \
760c16b537SWarner Losh   }
77*f7cd7fe5SConrad Meyer #undef  DISPLAYUPDATE
780c16b537SWarner Losh #define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
790c16b537SWarner Losh 
800c16b537SWarner Losh /*-*************************************
810c16b537SWarner Losh * Hash table
820c16b537SWarner Losh ***************************************
830c16b537SWarner Losh * A small specialized hash map for storing activeDmers.
840c16b537SWarner Losh * The map does not resize, so if it becomes full it will loop forever.
850c16b537SWarner Losh * Thus, the map must be large enough to store every value.
860c16b537SWarner Losh * The map implements linear probing and keeps its load less than 0.5.
870c16b537SWarner Losh */
880c16b537SWarner Losh 
890c16b537SWarner Losh #define MAP_EMPTY_VALUE ((U32)-1)
900c16b537SWarner Losh typedef struct COVER_map_pair_t_s {
910c16b537SWarner Losh   U32 key;
920c16b537SWarner Losh   U32 value;
930c16b537SWarner Losh } COVER_map_pair_t;
940c16b537SWarner Losh 
950c16b537SWarner Losh typedef struct COVER_map_s {
960c16b537SWarner Losh   COVER_map_pair_t *data;
970c16b537SWarner Losh   U32 sizeLog;
980c16b537SWarner Losh   U32 size;
990c16b537SWarner Losh   U32 sizeMask;
1000c16b537SWarner Losh } COVER_map_t;
1010c16b537SWarner Losh 
1020c16b537SWarner Losh /**
1030c16b537SWarner Losh  * Clear the map.
1040c16b537SWarner Losh  */
1050c16b537SWarner Losh static void COVER_map_clear(COVER_map_t *map) {
1060c16b537SWarner Losh   memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
1070c16b537SWarner Losh }
1080c16b537SWarner Losh 
1090c16b537SWarner Losh /**
1100c16b537SWarner Losh  * Initializes a map of the given size.
1110c16b537SWarner Losh  * Returns 1 on success and 0 on failure.
1120c16b537SWarner Losh  * The map must be destroyed with COVER_map_destroy().
1130c16b537SWarner Losh  * The map is only guaranteed to be large enough to hold size elements.
1140c16b537SWarner Losh  */
1150c16b537SWarner Losh static int COVER_map_init(COVER_map_t *map, U32 size) {
1160c16b537SWarner Losh   map->sizeLog = ZSTD_highbit32(size) + 2;
1170c16b537SWarner Losh   map->size = (U32)1 << map->sizeLog;
1180c16b537SWarner Losh   map->sizeMask = map->size - 1;
1190c16b537SWarner Losh   map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
1200c16b537SWarner Losh   if (!map->data) {
1210c16b537SWarner Losh     map->sizeLog = 0;
1220c16b537SWarner Losh     map->size = 0;
1230c16b537SWarner Losh     return 0;
1240c16b537SWarner Losh   }
1250c16b537SWarner Losh   COVER_map_clear(map);
1260c16b537SWarner Losh   return 1;
1270c16b537SWarner Losh }
1280c16b537SWarner Losh 
1290c16b537SWarner Losh /**
1300c16b537SWarner Losh  * Internal hash function
1310c16b537SWarner Losh  */
132*f7cd7fe5SConrad Meyer static const U32 COVER_prime4bytes = 2654435761U;
1330c16b537SWarner Losh static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
134*f7cd7fe5SConrad Meyer   return (key * COVER_prime4bytes) >> (32 - map->sizeLog);
1350c16b537SWarner Losh }
1360c16b537SWarner Losh 
1370c16b537SWarner Losh /**
1380c16b537SWarner Losh  * Helper function that returns the index that a key should be placed into.
1390c16b537SWarner Losh  */
1400c16b537SWarner Losh static U32 COVER_map_index(COVER_map_t *map, U32 key) {
1410c16b537SWarner Losh   const U32 hash = COVER_map_hash(map, key);
1420c16b537SWarner Losh   U32 i;
1430c16b537SWarner Losh   for (i = hash;; i = (i + 1) & map->sizeMask) {
1440c16b537SWarner Losh     COVER_map_pair_t *pos = &map->data[i];
1450c16b537SWarner Losh     if (pos->value == MAP_EMPTY_VALUE) {
1460c16b537SWarner Losh       return i;
1470c16b537SWarner Losh     }
1480c16b537SWarner Losh     if (pos->key == key) {
1490c16b537SWarner Losh       return i;
1500c16b537SWarner Losh     }
1510c16b537SWarner Losh   }
1520c16b537SWarner Losh }
1530c16b537SWarner Losh 
1540c16b537SWarner Losh /**
1550c16b537SWarner Losh  * Returns the pointer to the value for key.
1560c16b537SWarner Losh  * If key is not in the map, it is inserted and the value is set to 0.
1570c16b537SWarner Losh  * The map must not be full.
1580c16b537SWarner Losh  */
1590c16b537SWarner Losh static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
1600c16b537SWarner Losh   COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
1610c16b537SWarner Losh   if (pos->value == MAP_EMPTY_VALUE) {
1620c16b537SWarner Losh     pos->key = key;
1630c16b537SWarner Losh     pos->value = 0;
1640c16b537SWarner Losh   }
1650c16b537SWarner Losh   return &pos->value;
1660c16b537SWarner Losh }
1670c16b537SWarner Losh 
1680c16b537SWarner Losh /**
1690c16b537SWarner Losh  * Deletes key from the map if present.
1700c16b537SWarner Losh  */
1710c16b537SWarner Losh static void COVER_map_remove(COVER_map_t *map, U32 key) {
1720c16b537SWarner Losh   U32 i = COVER_map_index(map, key);
1730c16b537SWarner Losh   COVER_map_pair_t *del = &map->data[i];
1740c16b537SWarner Losh   U32 shift = 1;
1750c16b537SWarner Losh   if (del->value == MAP_EMPTY_VALUE) {
1760c16b537SWarner Losh     return;
1770c16b537SWarner Losh   }
1780c16b537SWarner Losh   for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
1790c16b537SWarner Losh     COVER_map_pair_t *const pos = &map->data[i];
1800c16b537SWarner Losh     /* If the position is empty we are done */
1810c16b537SWarner Losh     if (pos->value == MAP_EMPTY_VALUE) {
1820c16b537SWarner Losh       del->value = MAP_EMPTY_VALUE;
1830c16b537SWarner Losh       return;
1840c16b537SWarner Losh     }
1850c16b537SWarner Losh     /* If pos can be moved to del do so */
1860c16b537SWarner Losh     if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
1870c16b537SWarner Losh       del->key = pos->key;
1880c16b537SWarner Losh       del->value = pos->value;
1890c16b537SWarner Losh       del = pos;
1900c16b537SWarner Losh       shift = 1;
1910c16b537SWarner Losh     } else {
1920c16b537SWarner Losh       ++shift;
1930c16b537SWarner Losh     }
1940c16b537SWarner Losh   }
1950c16b537SWarner Losh }
1960c16b537SWarner Losh 
1970c16b537SWarner Losh /**
1980f743729SConrad Meyer  * Destroys a map that is inited with COVER_map_init().
1990c16b537SWarner Losh  */
2000c16b537SWarner Losh static void COVER_map_destroy(COVER_map_t *map) {
2010c16b537SWarner Losh   if (map->data) {
2020c16b537SWarner Losh     free(map->data);
2030c16b537SWarner Losh   }
2040c16b537SWarner Losh   map->data = NULL;
2050c16b537SWarner Losh   map->size = 0;
2060c16b537SWarner Losh }
2070c16b537SWarner Losh 
2080c16b537SWarner Losh /*-*************************************
2090c16b537SWarner Losh * Context
2100c16b537SWarner Losh ***************************************/
2110c16b537SWarner Losh 
2120c16b537SWarner Losh typedef struct {
2130c16b537SWarner Losh   const BYTE *samples;
2140c16b537SWarner Losh   size_t *offsets;
2150c16b537SWarner Losh   const size_t *samplesSizes;
2160c16b537SWarner Losh   size_t nbSamples;
2170f743729SConrad Meyer   size_t nbTrainSamples;
2180f743729SConrad Meyer   size_t nbTestSamples;
2190c16b537SWarner Losh   U32 *suffix;
2200c16b537SWarner Losh   size_t suffixSize;
2210c16b537SWarner Losh   U32 *freqs;
2220c16b537SWarner Losh   U32 *dmerAt;
2230c16b537SWarner Losh   unsigned d;
2240c16b537SWarner Losh } COVER_ctx_t;
2250c16b537SWarner Losh 
2260c16b537SWarner Losh /* We need a global context for qsort... */
227*f7cd7fe5SConrad Meyer static COVER_ctx_t *g_coverCtx = NULL;
2280c16b537SWarner Losh 
2290c16b537SWarner Losh /*-*************************************
2300c16b537SWarner Losh *  Helper functions
2310c16b537SWarner Losh ***************************************/
2320c16b537SWarner Losh 
2330c16b537SWarner Losh /**
2340c16b537SWarner Losh  * Returns the sum of the sample sizes.
2350c16b537SWarner Losh  */
2360f743729SConrad Meyer size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
2370c16b537SWarner Losh   size_t sum = 0;
2380f743729SConrad Meyer   unsigned i;
2390c16b537SWarner Losh   for (i = 0; i < nbSamples; ++i) {
2400c16b537SWarner Losh     sum += samplesSizes[i];
2410c16b537SWarner Losh   }
2420c16b537SWarner Losh   return sum;
2430c16b537SWarner Losh }
2440c16b537SWarner Losh 
2450c16b537SWarner Losh /**
2460c16b537SWarner Losh  * Returns -1 if the dmer at lp is less than the dmer at rp.
2470c16b537SWarner Losh  * Return 0 if the dmers at lp and rp are equal.
2480c16b537SWarner Losh  * Returns 1 if the dmer at lp is greater than the dmer at rp.
2490c16b537SWarner Losh  */
2500c16b537SWarner Losh static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
2510c16b537SWarner Losh   U32 const lhs = *(U32 const *)lp;
2520c16b537SWarner Losh   U32 const rhs = *(U32 const *)rp;
2530c16b537SWarner Losh   return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
2540c16b537SWarner Losh }
2550c16b537SWarner Losh /**
2560c16b537SWarner Losh  * Faster version for d <= 8.
2570c16b537SWarner Losh  */
2580c16b537SWarner Losh static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
2590c16b537SWarner Losh   U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
2600c16b537SWarner Losh   U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
2610c16b537SWarner Losh   U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
2620c16b537SWarner Losh   if (lhs < rhs) {
2630c16b537SWarner Losh     return -1;
2640c16b537SWarner Losh   }
2650c16b537SWarner Losh   return (lhs > rhs);
2660c16b537SWarner Losh }
2670c16b537SWarner Losh 
2680c16b537SWarner Losh /**
2690c16b537SWarner Losh  * Same as COVER_cmp() except ties are broken by pointer value
270*f7cd7fe5SConrad Meyer  * NOTE: g_coverCtx must be set to call this function.  A global is required because
2710c16b537SWarner Losh  * qsort doesn't take an opaque pointer.
2720c16b537SWarner Losh  */
273*f7cd7fe5SConrad Meyer static int WIN_CDECL COVER_strict_cmp(const void *lp, const void *rp) {
274*f7cd7fe5SConrad Meyer   int result = COVER_cmp(g_coverCtx, lp, rp);
2750c16b537SWarner Losh   if (result == 0) {
2760c16b537SWarner Losh     result = lp < rp ? -1 : 1;
2770c16b537SWarner Losh   }
2780c16b537SWarner Losh   return result;
2790c16b537SWarner Losh }
2800c16b537SWarner Losh /**
2810c16b537SWarner Losh  * Faster version for d <= 8.
2820c16b537SWarner Losh  */
283*f7cd7fe5SConrad Meyer static int WIN_CDECL COVER_strict_cmp8(const void *lp, const void *rp) {
284*f7cd7fe5SConrad Meyer   int result = COVER_cmp8(g_coverCtx, lp, rp);
2850c16b537SWarner Losh   if (result == 0) {
2860c16b537SWarner Losh     result = lp < rp ? -1 : 1;
2870c16b537SWarner Losh   }
2880c16b537SWarner Losh   return result;
2890c16b537SWarner Losh }
2900c16b537SWarner Losh 
2910c16b537SWarner Losh /**
2920c16b537SWarner Losh  * Returns the first pointer in [first, last) whose element does not compare
2930c16b537SWarner Losh  * less than value.  If no such element exists it returns last.
2940c16b537SWarner Losh  */
2950c16b537SWarner Losh static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
2960c16b537SWarner Losh                                        size_t value) {
2970c16b537SWarner Losh   size_t count = last - first;
2980c16b537SWarner Losh   while (count != 0) {
2990c16b537SWarner Losh     size_t step = count / 2;
3000c16b537SWarner Losh     const size_t *ptr = first;
3010c16b537SWarner Losh     ptr += step;
3020c16b537SWarner Losh     if (*ptr < value) {
3030c16b537SWarner Losh       first = ++ptr;
3040c16b537SWarner Losh       count -= step + 1;
3050c16b537SWarner Losh     } else {
3060c16b537SWarner Losh       count = step;
3070c16b537SWarner Losh     }
3080c16b537SWarner Losh   }
3090c16b537SWarner Losh   return first;
3100c16b537SWarner Losh }
3110c16b537SWarner Losh 
3120c16b537SWarner Losh /**
3130c16b537SWarner Losh  * Generic groupBy function.
3140c16b537SWarner Losh  * Groups an array sorted by cmp into groups with equivalent values.
3150c16b537SWarner Losh  * Calls grp for each group.
3160c16b537SWarner Losh  */
3170c16b537SWarner Losh static void
3180c16b537SWarner Losh COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
3190c16b537SWarner Losh               int (*cmp)(COVER_ctx_t *, const void *, const void *),
3200c16b537SWarner Losh               void (*grp)(COVER_ctx_t *, const void *, const void *)) {
3210c16b537SWarner Losh   const BYTE *ptr = (const BYTE *)data;
3220c16b537SWarner Losh   size_t num = 0;
3230c16b537SWarner Losh   while (num < count) {
3240c16b537SWarner Losh     const BYTE *grpEnd = ptr + size;
3250c16b537SWarner Losh     ++num;
3260c16b537SWarner Losh     while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
3270c16b537SWarner Losh       grpEnd += size;
3280c16b537SWarner Losh       ++num;
3290c16b537SWarner Losh     }
3300c16b537SWarner Losh     grp(ctx, ptr, grpEnd);
3310c16b537SWarner Losh     ptr = grpEnd;
3320c16b537SWarner Losh   }
3330c16b537SWarner Losh }
3340c16b537SWarner Losh 
3350c16b537SWarner Losh /*-*************************************
3360c16b537SWarner Losh *  Cover functions
3370c16b537SWarner Losh ***************************************/
3380c16b537SWarner Losh 
3390c16b537SWarner Losh /**
3400c16b537SWarner Losh  * Called on each group of positions with the same dmer.
3410c16b537SWarner Losh  * Counts the frequency of each dmer and saves it in the suffix array.
3420c16b537SWarner Losh  * Fills `ctx->dmerAt`.
3430c16b537SWarner Losh  */
3440c16b537SWarner Losh static void COVER_group(COVER_ctx_t *ctx, const void *group,
3450c16b537SWarner Losh                         const void *groupEnd) {
3460c16b537SWarner Losh   /* The group consists of all the positions with the same first d bytes. */
3470c16b537SWarner Losh   const U32 *grpPtr = (const U32 *)group;
3480c16b537SWarner Losh   const U32 *grpEnd = (const U32 *)groupEnd;
3490c16b537SWarner Losh   /* The dmerId is how we will reference this dmer.
3500c16b537SWarner Losh    * This allows us to map the whole dmer space to a much smaller space, the
3510c16b537SWarner Losh    * size of the suffix array.
3520c16b537SWarner Losh    */
3530c16b537SWarner Losh   const U32 dmerId = (U32)(grpPtr - ctx->suffix);
3540c16b537SWarner Losh   /* Count the number of samples this dmer shows up in */
3550c16b537SWarner Losh   U32 freq = 0;
3560c16b537SWarner Losh   /* Details */
3570c16b537SWarner Losh   const size_t *curOffsetPtr = ctx->offsets;
3580c16b537SWarner Losh   const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
3590c16b537SWarner Losh   /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
3600c16b537SWarner Losh    * different sample than the last.
3610c16b537SWarner Losh    */
3620c16b537SWarner Losh   size_t curSampleEnd = ctx->offsets[0];
3630c16b537SWarner Losh   for (; grpPtr != grpEnd; ++grpPtr) {
3640c16b537SWarner Losh     /* Save the dmerId for this position so we can get back to it. */
3650c16b537SWarner Losh     ctx->dmerAt[*grpPtr] = dmerId;
3660c16b537SWarner Losh     /* Dictionaries only help for the first reference to the dmer.
3670c16b537SWarner Losh      * After that zstd can reference the match from the previous reference.
3680c16b537SWarner Losh      * So only count each dmer once for each sample it is in.
3690c16b537SWarner Losh      */
3700c16b537SWarner Losh     if (*grpPtr < curSampleEnd) {
3710c16b537SWarner Losh       continue;
3720c16b537SWarner Losh     }
3730c16b537SWarner Losh     freq += 1;
3740c16b537SWarner Losh     /* Binary search to find the end of the sample *grpPtr is in.
3750c16b537SWarner Losh      * In the common case that grpPtr + 1 == grpEnd we can skip the binary
3760c16b537SWarner Losh      * search because the loop is over.
3770c16b537SWarner Losh      */
3780c16b537SWarner Losh     if (grpPtr + 1 != grpEnd) {
3790c16b537SWarner Losh       const size_t *sampleEndPtr =
3800c16b537SWarner Losh           COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
3810c16b537SWarner Losh       curSampleEnd = *sampleEndPtr;
3820c16b537SWarner Losh       curOffsetPtr = sampleEndPtr + 1;
3830c16b537SWarner Losh     }
3840c16b537SWarner Losh   }
3850c16b537SWarner Losh   /* At this point we are never going to look at this segment of the suffix
3860c16b537SWarner Losh    * array again.  We take advantage of this fact to save memory.
3870c16b537SWarner Losh    * We store the frequency of the dmer in the first position of the group,
3880c16b537SWarner Losh    * which is dmerId.
3890c16b537SWarner Losh    */
3900c16b537SWarner Losh   ctx->suffix[dmerId] = freq;
3910c16b537SWarner Losh }
3920c16b537SWarner Losh 
3930c16b537SWarner Losh 
3940c16b537SWarner Losh /**
3950c16b537SWarner Losh  * Selects the best segment in an epoch.
3960c16b537SWarner Losh  * Segments of are scored according to the function:
3970c16b537SWarner Losh  *
3980c16b537SWarner Losh  * Let F(d) be the frequency of dmer d.
3990c16b537SWarner Losh  * Let S_i be the dmer at position i of segment S which has length k.
4000c16b537SWarner Losh  *
4010c16b537SWarner Losh  *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
4020c16b537SWarner Losh  *
4032b9c00cbSConrad Meyer  * Once the dmer d is in the dictionary we set F(d) = 0.
4040c16b537SWarner Losh  */
4050c16b537SWarner Losh static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
4060c16b537SWarner Losh                                            COVER_map_t *activeDmers, U32 begin,
4070c16b537SWarner Losh                                            U32 end,
4080c16b537SWarner Losh                                            ZDICT_cover_params_t parameters) {
4090c16b537SWarner Losh   /* Constants */
4100c16b537SWarner Losh   const U32 k = parameters.k;
4110c16b537SWarner Losh   const U32 d = parameters.d;
4120c16b537SWarner Losh   const U32 dmersInK = k - d + 1;
4130c16b537SWarner Losh   /* Try each segment (activeSegment) and save the best (bestSegment) */
4140c16b537SWarner Losh   COVER_segment_t bestSegment = {0, 0, 0};
4150c16b537SWarner Losh   COVER_segment_t activeSegment;
4160c16b537SWarner Losh   /* Reset the activeDmers in the segment */
4170c16b537SWarner Losh   COVER_map_clear(activeDmers);
4180c16b537SWarner Losh   /* The activeSegment starts at the beginning of the epoch. */
4190c16b537SWarner Losh   activeSegment.begin = begin;
4200c16b537SWarner Losh   activeSegment.end = begin;
4210c16b537SWarner Losh   activeSegment.score = 0;
4220c16b537SWarner Losh   /* Slide the activeSegment through the whole epoch.
4230c16b537SWarner Losh    * Save the best segment in bestSegment.
4240c16b537SWarner Losh    */
4250c16b537SWarner Losh   while (activeSegment.end < end) {
4260c16b537SWarner Losh     /* The dmerId for the dmer at the next position */
4270c16b537SWarner Losh     U32 newDmer = ctx->dmerAt[activeSegment.end];
4280c16b537SWarner Losh     /* The entry in activeDmers for this dmerId */
4290c16b537SWarner Losh     U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
4300c16b537SWarner Losh     /* If the dmer isn't already present in the segment add its score. */
4310c16b537SWarner Losh     if (*newDmerOcc == 0) {
4320c16b537SWarner Losh       /* The paper suggest using the L-0.5 norm, but experiments show that it
4330c16b537SWarner Losh        * doesn't help.
4340c16b537SWarner Losh        */
4350c16b537SWarner Losh       activeSegment.score += freqs[newDmer];
4360c16b537SWarner Losh     }
4370c16b537SWarner Losh     /* Add the dmer to the segment */
4380c16b537SWarner Losh     activeSegment.end += 1;
4390c16b537SWarner Losh     *newDmerOcc += 1;
4400c16b537SWarner Losh 
4410c16b537SWarner Losh     /* If the window is now too large, drop the first position */
4420c16b537SWarner Losh     if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
4430c16b537SWarner Losh       U32 delDmer = ctx->dmerAt[activeSegment.begin];
4440c16b537SWarner Losh       U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
4450c16b537SWarner Losh       activeSegment.begin += 1;
4460c16b537SWarner Losh       *delDmerOcc -= 1;
4472b9c00cbSConrad Meyer       /* If this is the last occurrence of the dmer, subtract its score */
4480c16b537SWarner Losh       if (*delDmerOcc == 0) {
4490c16b537SWarner Losh         COVER_map_remove(activeDmers, delDmer);
4500c16b537SWarner Losh         activeSegment.score -= freqs[delDmer];
4510c16b537SWarner Losh       }
4520c16b537SWarner Losh     }
4530c16b537SWarner Losh 
4540c16b537SWarner Losh     /* If this segment is the best so far save it */
4550c16b537SWarner Losh     if (activeSegment.score > bestSegment.score) {
4560c16b537SWarner Losh       bestSegment = activeSegment;
4570c16b537SWarner Losh     }
4580c16b537SWarner Losh   }
4590c16b537SWarner Losh   {
4600c16b537SWarner Losh     /* Trim off the zero frequency head and tail from the segment. */
4610c16b537SWarner Losh     U32 newBegin = bestSegment.end;
4620c16b537SWarner Losh     U32 newEnd = bestSegment.begin;
4630c16b537SWarner Losh     U32 pos;
4640c16b537SWarner Losh     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
4650c16b537SWarner Losh       U32 freq = freqs[ctx->dmerAt[pos]];
4660c16b537SWarner Losh       if (freq != 0) {
4670c16b537SWarner Losh         newBegin = MIN(newBegin, pos);
4680c16b537SWarner Losh         newEnd = pos + 1;
4690c16b537SWarner Losh       }
4700c16b537SWarner Losh     }
4710c16b537SWarner Losh     bestSegment.begin = newBegin;
4720c16b537SWarner Losh     bestSegment.end = newEnd;
4730c16b537SWarner Losh   }
4740c16b537SWarner Losh   {
4750c16b537SWarner Losh     /* Zero out the frequency of each dmer covered by the chosen segment. */
4760c16b537SWarner Losh     U32 pos;
4770c16b537SWarner Losh     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
4780c16b537SWarner Losh       freqs[ctx->dmerAt[pos]] = 0;
4790c16b537SWarner Losh     }
4800c16b537SWarner Losh   }
4810c16b537SWarner Losh   return bestSegment;
4820c16b537SWarner Losh }
4830c16b537SWarner Losh 
4840c16b537SWarner Losh /**
4850c16b537SWarner Losh  * Check the validity of the parameters.
4860c16b537SWarner Losh  * Returns non-zero if the parameters are valid and 0 otherwise.
4870c16b537SWarner Losh  */
4880c16b537SWarner Losh static int COVER_checkParameters(ZDICT_cover_params_t parameters,
4890c16b537SWarner Losh                                  size_t maxDictSize) {
4900c16b537SWarner Losh   /* k and d are required parameters */
4910c16b537SWarner Losh   if (parameters.d == 0 || parameters.k == 0) {
4920c16b537SWarner Losh     return 0;
4930c16b537SWarner Losh   }
4940c16b537SWarner Losh   /* k <= maxDictSize */
4950c16b537SWarner Losh   if (parameters.k > maxDictSize) {
4960c16b537SWarner Losh     return 0;
4970c16b537SWarner Losh   }
4980c16b537SWarner Losh   /* d <= k */
4990c16b537SWarner Losh   if (parameters.d > parameters.k) {
5000c16b537SWarner Losh     return 0;
5010c16b537SWarner Losh   }
5020f743729SConrad Meyer   /* 0 < splitPoint <= 1 */
5030f743729SConrad Meyer   if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
5040f743729SConrad Meyer     return 0;
5050f743729SConrad Meyer   }
5060c16b537SWarner Losh   return 1;
5070c16b537SWarner Losh }
5080c16b537SWarner Losh 
5090c16b537SWarner Losh /**
5100c16b537SWarner Losh  * Clean up a context initialized with `COVER_ctx_init()`.
5110c16b537SWarner Losh  */
5120c16b537SWarner Losh static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
5130c16b537SWarner Losh   if (!ctx) {
5140c16b537SWarner Losh     return;
5150c16b537SWarner Losh   }
5160c16b537SWarner Losh   if (ctx->suffix) {
5170c16b537SWarner Losh     free(ctx->suffix);
5180c16b537SWarner Losh     ctx->suffix = NULL;
5190c16b537SWarner Losh   }
5200c16b537SWarner Losh   if (ctx->freqs) {
5210c16b537SWarner Losh     free(ctx->freqs);
5220c16b537SWarner Losh     ctx->freqs = NULL;
5230c16b537SWarner Losh   }
5240c16b537SWarner Losh   if (ctx->dmerAt) {
5250c16b537SWarner Losh     free(ctx->dmerAt);
5260c16b537SWarner Losh     ctx->dmerAt = NULL;
5270c16b537SWarner Losh   }
5280c16b537SWarner Losh   if (ctx->offsets) {
5290c16b537SWarner Losh     free(ctx->offsets);
5300c16b537SWarner Losh     ctx->offsets = NULL;
5310c16b537SWarner Losh   }
5320c16b537SWarner Losh }
5330c16b537SWarner Losh 
5340c16b537SWarner Losh /**
5350c16b537SWarner Losh  * Prepare a context for dictionary building.
5360c16b537SWarner Losh  * The context is only dependent on the parameter `d` and can used multiple
5370c16b537SWarner Losh  * times.
5384d3f1eafSConrad Meyer  * Returns 0 on success or error code on error.
5390c16b537SWarner Losh  * The context must be destroyed with `COVER_ctx_destroy()`.
5400c16b537SWarner Losh  */
5414d3f1eafSConrad Meyer static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
5420c16b537SWarner Losh                           const size_t *samplesSizes, unsigned nbSamples,
5430f743729SConrad Meyer                           unsigned d, double splitPoint) {
5440c16b537SWarner Losh   const BYTE *const samples = (const BYTE *)samplesBuffer;
5450c16b537SWarner Losh   const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
5460f743729SConrad Meyer   /* Split samples into testing and training sets */
5470f743729SConrad Meyer   const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
5480f743729SConrad Meyer   const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
5490f743729SConrad Meyer   const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
5500f743729SConrad Meyer   const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
5510c16b537SWarner Losh   /* Checks */
5520c16b537SWarner Losh   if (totalSamplesSize < MAX(d, sizeof(U64)) ||
5530c16b537SWarner Losh       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
55419fcbaf1SConrad Meyer     DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
555a0483764SConrad Meyer                  (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
5564d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
5570c16b537SWarner Losh   }
5580f743729SConrad Meyer   /* Check if there are at least 5 training samples */
5590f743729SConrad Meyer   if (nbTrainSamples < 5) {
5600f743729SConrad Meyer     DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
5614d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
5620f743729SConrad Meyer   }
5630f743729SConrad Meyer   /* Check if there's testing sample */
5640f743729SConrad Meyer   if (nbTestSamples < 1) {
5650f743729SConrad Meyer     DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
5664d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
5670f743729SConrad Meyer   }
5680c16b537SWarner Losh   /* Zero the context */
5690c16b537SWarner Losh   memset(ctx, 0, sizeof(*ctx));
5700f743729SConrad Meyer   DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
571a0483764SConrad Meyer                (unsigned)trainingSamplesSize);
5720f743729SConrad Meyer   DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
573a0483764SConrad Meyer                (unsigned)testSamplesSize);
5740c16b537SWarner Losh   ctx->samples = samples;
5750c16b537SWarner Losh   ctx->samplesSizes = samplesSizes;
5760c16b537SWarner Losh   ctx->nbSamples = nbSamples;
5770f743729SConrad Meyer   ctx->nbTrainSamples = nbTrainSamples;
5780f743729SConrad Meyer   ctx->nbTestSamples = nbTestSamples;
5790c16b537SWarner Losh   /* Partial suffix array */
5800f743729SConrad Meyer   ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
5810c16b537SWarner Losh   ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
5820c16b537SWarner Losh   /* Maps index to the dmerID */
5830c16b537SWarner Losh   ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
5840c16b537SWarner Losh   /* The offsets of each file */
5850c16b537SWarner Losh   ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
5860c16b537SWarner Losh   if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
5870c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
5880c16b537SWarner Losh     COVER_ctx_destroy(ctx);
5894d3f1eafSConrad Meyer     return ERROR(memory_allocation);
5900c16b537SWarner Losh   }
5910c16b537SWarner Losh   ctx->freqs = NULL;
5920c16b537SWarner Losh   ctx->d = d;
5930c16b537SWarner Losh 
5940f743729SConrad Meyer   /* Fill offsets from the samplesSizes */
5950c16b537SWarner Losh   {
5960c16b537SWarner Losh     U32 i;
5970c16b537SWarner Losh     ctx->offsets[0] = 0;
5980c16b537SWarner Losh     for (i = 1; i <= nbSamples; ++i) {
5990c16b537SWarner Losh       ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
6000c16b537SWarner Losh     }
6010c16b537SWarner Losh   }
6020c16b537SWarner Losh   DISPLAYLEVEL(2, "Constructing partial suffix array\n");
6030c16b537SWarner Losh   {
6040c16b537SWarner Losh     /* suffix is a partial suffix array.
6050c16b537SWarner Losh      * It only sorts suffixes by their first parameters.d bytes.
6060c16b537SWarner Losh      * The sort is stable, so each dmer group is sorted by position in input.
6070c16b537SWarner Losh      */
6080c16b537SWarner Losh     U32 i;
6090c16b537SWarner Losh     for (i = 0; i < ctx->suffixSize; ++i) {
6100c16b537SWarner Losh       ctx->suffix[i] = i;
6110c16b537SWarner Losh     }
6120f743729SConrad Meyer     /* qsort doesn't take an opaque pointer, so pass as a global.
6130f743729SConrad Meyer      * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
6140f743729SConrad Meyer      */
615*f7cd7fe5SConrad Meyer     g_coverCtx = ctx;
6160f743729SConrad Meyer #if defined(__OpenBSD__)
6170f743729SConrad Meyer     mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
6180f743729SConrad Meyer           (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
6190f743729SConrad Meyer #else
6200c16b537SWarner Losh     qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
6210c16b537SWarner Losh           (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
6220f743729SConrad Meyer #endif
6230c16b537SWarner Losh   }
6240c16b537SWarner Losh   DISPLAYLEVEL(2, "Computing frequencies\n");
6250c16b537SWarner Losh   /* For each dmer group (group of positions with the same first d bytes):
6260c16b537SWarner Losh    * 1. For each position we set dmerAt[position] = dmerID.  The dmerID is
6270c16b537SWarner Losh    *    (groupBeginPtr - suffix).  This allows us to go from position to
6280c16b537SWarner Losh    *    dmerID so we can look up values in freq.
6290c16b537SWarner Losh    * 2. We calculate how many samples the dmer occurs in and save it in
6300c16b537SWarner Losh    *    freqs[dmerId].
6310c16b537SWarner Losh    */
6320c16b537SWarner Losh   COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
6330c16b537SWarner Losh                 (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
6340c16b537SWarner Losh   ctx->freqs = ctx->suffix;
6350c16b537SWarner Losh   ctx->suffix = NULL;
6364d3f1eafSConrad Meyer   return 0;
6370c16b537SWarner Losh }
6380c16b537SWarner Losh 
6392b9c00cbSConrad Meyer void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
6402b9c00cbSConrad Meyer {
6412b9c00cbSConrad Meyer   const double ratio = (double)nbDmers / maxDictSize;
6422b9c00cbSConrad Meyer   if (ratio >= 10) {
6432b9c00cbSConrad Meyer       return;
6442b9c00cbSConrad Meyer   }
6452b9c00cbSConrad Meyer   LOCALDISPLAYLEVEL(displayLevel, 1,
6462b9c00cbSConrad Meyer                     "WARNING: The maximum dictionary size %u is too large "
6472b9c00cbSConrad Meyer                     "compared to the source size %u! "
6482b9c00cbSConrad Meyer                     "size(source)/size(dictionary) = %f, but it should be >= "
6492b9c00cbSConrad Meyer                     "10! This may lead to a subpar dictionary! We recommend "
6509cbefe25SConrad Meyer                     "training on sources at least 10x, and preferably 100x "
6519cbefe25SConrad Meyer                     "the size of the dictionary! \n", (U32)maxDictSize,
6522b9c00cbSConrad Meyer                     (U32)nbDmers, ratio);
6532b9c00cbSConrad Meyer }
6542b9c00cbSConrad Meyer 
6552b9c00cbSConrad Meyer COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
6562b9c00cbSConrad Meyer                                        U32 nbDmers, U32 k, U32 passes)
6572b9c00cbSConrad Meyer {
6582b9c00cbSConrad Meyer   const U32 minEpochSize = k * 10;
6592b9c00cbSConrad Meyer   COVER_epoch_info_t epochs;
6602b9c00cbSConrad Meyer   epochs.num = MAX(1, maxDictSize / k / passes);
6612b9c00cbSConrad Meyer   epochs.size = nbDmers / epochs.num;
6622b9c00cbSConrad Meyer   if (epochs.size >= minEpochSize) {
6632b9c00cbSConrad Meyer       assert(epochs.size * epochs.num <= nbDmers);
6642b9c00cbSConrad Meyer       return epochs;
6652b9c00cbSConrad Meyer   }
6662b9c00cbSConrad Meyer   epochs.size = MIN(minEpochSize, nbDmers);
6672b9c00cbSConrad Meyer   epochs.num = nbDmers / epochs.size;
6682b9c00cbSConrad Meyer   assert(epochs.size * epochs.num <= nbDmers);
6692b9c00cbSConrad Meyer   return epochs;
6702b9c00cbSConrad Meyer }
6712b9c00cbSConrad Meyer 
6720c16b537SWarner Losh /**
6730c16b537SWarner Losh  * Given the prepared context build the dictionary.
6740c16b537SWarner Losh  */
6750c16b537SWarner Losh static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
6760c16b537SWarner Losh                                     COVER_map_t *activeDmers, void *dictBuffer,
6770c16b537SWarner Losh                                     size_t dictBufferCapacity,
6780c16b537SWarner Losh                                     ZDICT_cover_params_t parameters) {
6790c16b537SWarner Losh   BYTE *const dict = (BYTE *)dictBuffer;
6800c16b537SWarner Losh   size_t tail = dictBufferCapacity;
6812b9c00cbSConrad Meyer   /* Divide the data into epochs. We will select one segment from each epoch. */
6822b9c00cbSConrad Meyer   const COVER_epoch_info_t epochs = COVER_computeEpochs(
6832b9c00cbSConrad Meyer       (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
6842b9c00cbSConrad Meyer   const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
6852b9c00cbSConrad Meyer   size_t zeroScoreRun = 0;
6860c16b537SWarner Losh   size_t epoch;
687a0483764SConrad Meyer   DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
6882b9c00cbSConrad Meyer                 (U32)epochs.num, (U32)epochs.size);
6890c16b537SWarner Losh   /* Loop through the epochs until there are no more segments or the dictionary
6900c16b537SWarner Losh    * is full.
6910c16b537SWarner Losh    */
6922b9c00cbSConrad Meyer   for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
6932b9c00cbSConrad Meyer     const U32 epochBegin = (U32)(epoch * epochs.size);
6942b9c00cbSConrad Meyer     const U32 epochEnd = epochBegin + epochs.size;
6950c16b537SWarner Losh     size_t segmentSize;
6960c16b537SWarner Losh     /* Select a segment */
6970c16b537SWarner Losh     COVER_segment_t segment = COVER_selectSegment(
6980c16b537SWarner Losh         ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
6992b9c00cbSConrad Meyer     /* If the segment covers no dmers, then we are out of content.
7002b9c00cbSConrad Meyer      * There may be new content in other epochs, for continue for some time.
7012b9c00cbSConrad Meyer      */
7020c16b537SWarner Losh     if (segment.score == 0) {
7032b9c00cbSConrad Meyer       if (++zeroScoreRun >= maxZeroScoreRun) {
7040c16b537SWarner Losh           break;
7050c16b537SWarner Losh       }
7062b9c00cbSConrad Meyer       continue;
7072b9c00cbSConrad Meyer     }
7082b9c00cbSConrad Meyer     zeroScoreRun = 0;
7090c16b537SWarner Losh     /* Trim the segment if necessary and if it is too small then we are done */
7100c16b537SWarner Losh     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
7110c16b537SWarner Losh     if (segmentSize < parameters.d) {
7120c16b537SWarner Losh       break;
7130c16b537SWarner Losh     }
7140c16b537SWarner Losh     /* We fill the dictionary from the back to allow the best segments to be
7150c16b537SWarner Losh      * referenced with the smallest offsets.
7160c16b537SWarner Losh      */
7170c16b537SWarner Losh     tail -= segmentSize;
7180c16b537SWarner Losh     memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
7190c16b537SWarner Losh     DISPLAYUPDATE(
7200c16b537SWarner Losh         2, "\r%u%%       ",
721a0483764SConrad Meyer         (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
7220c16b537SWarner Losh   }
7230c16b537SWarner Losh   DISPLAYLEVEL(2, "\r%79s\r", "");
7240c16b537SWarner Losh   return tail;
7250c16b537SWarner Losh }
7260c16b537SWarner Losh 
7270c16b537SWarner Losh ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
72819fcbaf1SConrad Meyer     void *dictBuffer, size_t dictBufferCapacity,
72919fcbaf1SConrad Meyer     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
73019fcbaf1SConrad Meyer     ZDICT_cover_params_t parameters)
73119fcbaf1SConrad Meyer {
7320c16b537SWarner Losh   BYTE* const dict = (BYTE*)dictBuffer;
7330c16b537SWarner Losh   COVER_ctx_t ctx;
7340c16b537SWarner Losh   COVER_map_t activeDmers;
7350f743729SConrad Meyer   parameters.splitPoint = 1.0;
73619fcbaf1SConrad Meyer   /* Initialize global data */
73719fcbaf1SConrad Meyer   g_displayLevel = parameters.zParams.notificationLevel;
7380c16b537SWarner Losh   /* Checks */
7390c16b537SWarner Losh   if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
7400c16b537SWarner Losh     DISPLAYLEVEL(1, "Cover parameters incorrect\n");
7414d3f1eafSConrad Meyer     return ERROR(parameter_outOfBound);
7420c16b537SWarner Losh   }
7430c16b537SWarner Losh   if (nbSamples == 0) {
7440c16b537SWarner Losh     DISPLAYLEVEL(1, "Cover must have at least one input file\n");
7454d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
7460c16b537SWarner Losh   }
7470c16b537SWarner Losh   if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
7480c16b537SWarner Losh     DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
7490c16b537SWarner Losh                  ZDICT_DICTSIZE_MIN);
7500c16b537SWarner Losh     return ERROR(dstSize_tooSmall);
7510c16b537SWarner Losh   }
7520c16b537SWarner Losh   /* Initialize context and activeDmers */
7534d3f1eafSConrad Meyer   {
7544d3f1eafSConrad Meyer     size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
7554d3f1eafSConrad Meyer                       parameters.d, parameters.splitPoint);
7564d3f1eafSConrad Meyer     if (ZSTD_isError(initVal)) {
7574d3f1eafSConrad Meyer       return initVal;
7584d3f1eafSConrad Meyer     }
7590c16b537SWarner Losh   }
7602b9c00cbSConrad Meyer   COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
7610c16b537SWarner Losh   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
7620c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
7630c16b537SWarner Losh     COVER_ctx_destroy(&ctx);
7644d3f1eafSConrad Meyer     return ERROR(memory_allocation);
7650c16b537SWarner Losh   }
7660c16b537SWarner Losh 
7670c16b537SWarner Losh   DISPLAYLEVEL(2, "Building dictionary\n");
7680c16b537SWarner Losh   {
7690c16b537SWarner Losh     const size_t tail =
7700c16b537SWarner Losh         COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
7710c16b537SWarner Losh                               dictBufferCapacity, parameters);
7720c16b537SWarner Losh     const size_t dictionarySize = ZDICT_finalizeDictionary(
7730c16b537SWarner Losh         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
7740c16b537SWarner Losh         samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
7750c16b537SWarner Losh     if (!ZSTD_isError(dictionarySize)) {
7760c16b537SWarner Losh       DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
777a0483764SConrad Meyer                    (unsigned)dictionarySize);
7780c16b537SWarner Losh     }
7790c16b537SWarner Losh     COVER_ctx_destroy(&ctx);
7800c16b537SWarner Losh     COVER_map_destroy(&activeDmers);
7810c16b537SWarner Losh     return dictionarySize;
7820c16b537SWarner Losh   }
7830c16b537SWarner Losh }
7840c16b537SWarner Losh 
7850f743729SConrad Meyer 
7860f743729SConrad Meyer 
7870f743729SConrad Meyer size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
7880f743729SConrad Meyer                                     const size_t *samplesSizes, const BYTE *samples,
7890f743729SConrad Meyer                                     size_t *offsets,
7900f743729SConrad Meyer                                     size_t nbTrainSamples, size_t nbSamples,
7910f743729SConrad Meyer                                     BYTE *const dict, size_t dictBufferCapacity) {
7920f743729SConrad Meyer   size_t totalCompressedSize = ERROR(GENERIC);
7930f743729SConrad Meyer   /* Pointers */
7940f743729SConrad Meyer   ZSTD_CCtx *cctx;
7950f743729SConrad Meyer   ZSTD_CDict *cdict;
7960f743729SConrad Meyer   void *dst;
7970f743729SConrad Meyer   /* Local variables */
7980f743729SConrad Meyer   size_t dstCapacity;
7990f743729SConrad Meyer   size_t i;
8000f743729SConrad Meyer   /* Allocate dst with enough space to compress the maximum sized sample */
8010f743729SConrad Meyer   {
8020f743729SConrad Meyer     size_t maxSampleSize = 0;
8030f743729SConrad Meyer     i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
8040f743729SConrad Meyer     for (; i < nbSamples; ++i) {
8050f743729SConrad Meyer       maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
8060f743729SConrad Meyer     }
8070f743729SConrad Meyer     dstCapacity = ZSTD_compressBound(maxSampleSize);
8080f743729SConrad Meyer     dst = malloc(dstCapacity);
8090f743729SConrad Meyer   }
8100f743729SConrad Meyer   /* Create the cctx and cdict */
8110f743729SConrad Meyer   cctx = ZSTD_createCCtx();
8120f743729SConrad Meyer   cdict = ZSTD_createCDict(dict, dictBufferCapacity,
8130f743729SConrad Meyer                            parameters.zParams.compressionLevel);
8140f743729SConrad Meyer   if (!dst || !cctx || !cdict) {
8150f743729SConrad Meyer     goto _compressCleanup;
8160f743729SConrad Meyer   }
8170f743729SConrad Meyer   /* Compress each sample and sum their sizes (or error) */
8180f743729SConrad Meyer   totalCompressedSize = dictBufferCapacity;
8190f743729SConrad Meyer   i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
8200f743729SConrad Meyer   for (; i < nbSamples; ++i) {
8210f743729SConrad Meyer     const size_t size = ZSTD_compress_usingCDict(
8220f743729SConrad Meyer         cctx, dst, dstCapacity, samples + offsets[i],
8230f743729SConrad Meyer         samplesSizes[i], cdict);
8240f743729SConrad Meyer     if (ZSTD_isError(size)) {
8254d3f1eafSConrad Meyer       totalCompressedSize = size;
8260f743729SConrad Meyer       goto _compressCleanup;
8270f743729SConrad Meyer     }
8280f743729SConrad Meyer     totalCompressedSize += size;
8290f743729SConrad Meyer   }
8300f743729SConrad Meyer _compressCleanup:
8310f743729SConrad Meyer   ZSTD_freeCCtx(cctx);
8320f743729SConrad Meyer   ZSTD_freeCDict(cdict);
8330f743729SConrad Meyer   if (dst) {
8340f743729SConrad Meyer     free(dst);
8350f743729SConrad Meyer   }
8360f743729SConrad Meyer   return totalCompressedSize;
8370f743729SConrad Meyer }
8380f743729SConrad Meyer 
8390c16b537SWarner Losh 
8400c16b537SWarner Losh /**
8410c16b537SWarner Losh  * Initialize the `COVER_best_t`.
8420c16b537SWarner Losh  */
8430f743729SConrad Meyer void COVER_best_init(COVER_best_t *best) {
8440c16b537SWarner Losh   if (best==NULL) return; /* compatible with init on NULL */
8450c16b537SWarner Losh   (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
8460c16b537SWarner Losh   (void)ZSTD_pthread_cond_init(&best->cond, NULL);
8470c16b537SWarner Losh   best->liveJobs = 0;
8480c16b537SWarner Losh   best->dict = NULL;
8490c16b537SWarner Losh   best->dictSize = 0;
8500c16b537SWarner Losh   best->compressedSize = (size_t)-1;
8510c16b537SWarner Losh   memset(&best->parameters, 0, sizeof(best->parameters));
8520c16b537SWarner Losh }
8530c16b537SWarner Losh 
8540c16b537SWarner Losh /**
8550c16b537SWarner Losh  * Wait until liveJobs == 0.
8560c16b537SWarner Losh  */
8570f743729SConrad Meyer void COVER_best_wait(COVER_best_t *best) {
8580c16b537SWarner Losh   if (!best) {
8590c16b537SWarner Losh     return;
8600c16b537SWarner Losh   }
8610c16b537SWarner Losh   ZSTD_pthread_mutex_lock(&best->mutex);
8620c16b537SWarner Losh   while (best->liveJobs != 0) {
8630c16b537SWarner Losh     ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
8640c16b537SWarner Losh   }
8650c16b537SWarner Losh   ZSTD_pthread_mutex_unlock(&best->mutex);
8660c16b537SWarner Losh }
8670c16b537SWarner Losh 
8680c16b537SWarner Losh /**
8690c16b537SWarner Losh  * Call COVER_best_wait() and then destroy the COVER_best_t.
8700c16b537SWarner Losh  */
8710f743729SConrad Meyer void COVER_best_destroy(COVER_best_t *best) {
8720c16b537SWarner Losh   if (!best) {
8730c16b537SWarner Losh     return;
8740c16b537SWarner Losh   }
8750c16b537SWarner Losh   COVER_best_wait(best);
8760c16b537SWarner Losh   if (best->dict) {
8770c16b537SWarner Losh     free(best->dict);
8780c16b537SWarner Losh   }
8790c16b537SWarner Losh   ZSTD_pthread_mutex_destroy(&best->mutex);
8800c16b537SWarner Losh   ZSTD_pthread_cond_destroy(&best->cond);
8810c16b537SWarner Losh }
8820c16b537SWarner Losh 
8830c16b537SWarner Losh /**
8840c16b537SWarner Losh  * Called when a thread is about to be launched.
8850c16b537SWarner Losh  * Increments liveJobs.
8860c16b537SWarner Losh  */
8870f743729SConrad Meyer void COVER_best_start(COVER_best_t *best) {
8880c16b537SWarner Losh   if (!best) {
8890c16b537SWarner Losh     return;
8900c16b537SWarner Losh   }
8910c16b537SWarner Losh   ZSTD_pthread_mutex_lock(&best->mutex);
8920c16b537SWarner Losh   ++best->liveJobs;
8930c16b537SWarner Losh   ZSTD_pthread_mutex_unlock(&best->mutex);
8940c16b537SWarner Losh }
8950c16b537SWarner Losh 
8960c16b537SWarner Losh /**
8970c16b537SWarner Losh  * Called when a thread finishes executing, both on error or success.
8980c16b537SWarner Losh  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
8990c16b537SWarner Losh  * If this dictionary is the best so far save it and its parameters.
9000c16b537SWarner Losh  */
9014d3f1eafSConrad Meyer void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
9024d3f1eafSConrad Meyer                               COVER_dictSelection_t selection) {
9034d3f1eafSConrad Meyer   void* dict = selection.dictContent;
9044d3f1eafSConrad Meyer   size_t compressedSize = selection.totalCompressedSize;
9054d3f1eafSConrad Meyer   size_t dictSize = selection.dictSize;
9060c16b537SWarner Losh   if (!best) {
9070c16b537SWarner Losh     return;
9080c16b537SWarner Losh   }
9090c16b537SWarner Losh   {
9100c16b537SWarner Losh     size_t liveJobs;
9110c16b537SWarner Losh     ZSTD_pthread_mutex_lock(&best->mutex);
9120c16b537SWarner Losh     --best->liveJobs;
9130c16b537SWarner Losh     liveJobs = best->liveJobs;
9140c16b537SWarner Losh     /* If the new dictionary is better */
9150c16b537SWarner Losh     if (compressedSize < best->compressedSize) {
9160c16b537SWarner Losh       /* Allocate space if necessary */
9170c16b537SWarner Losh       if (!best->dict || best->dictSize < dictSize) {
9180c16b537SWarner Losh         if (best->dict) {
9190c16b537SWarner Losh           free(best->dict);
9200c16b537SWarner Losh         }
9210c16b537SWarner Losh         best->dict = malloc(dictSize);
9220c16b537SWarner Losh         if (!best->dict) {
9230c16b537SWarner Losh           best->compressedSize = ERROR(GENERIC);
9240c16b537SWarner Losh           best->dictSize = 0;
925a0483764SConrad Meyer           ZSTD_pthread_cond_signal(&best->cond);
926a0483764SConrad Meyer           ZSTD_pthread_mutex_unlock(&best->mutex);
9270c16b537SWarner Losh           return;
9280c16b537SWarner Losh         }
9290c16b537SWarner Losh       }
9300c16b537SWarner Losh       /* Save the dictionary, parameters, and size */
9319cbefe25SConrad Meyer       if (dict) {
9320c16b537SWarner Losh         memcpy(best->dict, dict, dictSize);
9330c16b537SWarner Losh         best->dictSize = dictSize;
9340c16b537SWarner Losh         best->parameters = parameters;
9350c16b537SWarner Losh         best->compressedSize = compressedSize;
9360c16b537SWarner Losh       }
9379cbefe25SConrad Meyer     }
9380c16b537SWarner Losh     if (liveJobs == 0) {
9390c16b537SWarner Losh       ZSTD_pthread_cond_broadcast(&best->cond);
9400c16b537SWarner Losh     }
9410f743729SConrad Meyer     ZSTD_pthread_mutex_unlock(&best->mutex);
9420c16b537SWarner Losh   }
9430c16b537SWarner Losh }
9440c16b537SWarner Losh 
9454d3f1eafSConrad Meyer COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
9464d3f1eafSConrad Meyer     COVER_dictSelection_t selection = { NULL, 0, error };
9474d3f1eafSConrad Meyer     return selection;
9484d3f1eafSConrad Meyer }
9494d3f1eafSConrad Meyer 
9504d3f1eafSConrad Meyer unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
9514d3f1eafSConrad Meyer   return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
9524d3f1eafSConrad Meyer }
9534d3f1eafSConrad Meyer 
9544d3f1eafSConrad Meyer void COVER_dictSelectionFree(COVER_dictSelection_t selection){
9554d3f1eafSConrad Meyer   free(selection.dictContent);
9564d3f1eafSConrad Meyer }
9574d3f1eafSConrad Meyer 
958*f7cd7fe5SConrad Meyer COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
9594d3f1eafSConrad Meyer         size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
9604d3f1eafSConrad Meyer         size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
9614d3f1eafSConrad Meyer 
9624d3f1eafSConrad Meyer   size_t largestDict = 0;
9634d3f1eafSConrad Meyer   size_t largestCompressed = 0;
9644d3f1eafSConrad Meyer   BYTE* customDictContentEnd = customDictContent + dictContentSize;
9654d3f1eafSConrad Meyer 
966*f7cd7fe5SConrad Meyer   BYTE * largestDictbuffer = (BYTE *)malloc(dictBufferCapacity);
967*f7cd7fe5SConrad Meyer   BYTE * candidateDictBuffer = (BYTE *)malloc(dictBufferCapacity);
9684d3f1eafSConrad Meyer   double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
9694d3f1eafSConrad Meyer 
9704d3f1eafSConrad Meyer   if (!largestDictbuffer || !candidateDictBuffer) {
9714d3f1eafSConrad Meyer     free(largestDictbuffer);
9724d3f1eafSConrad Meyer     free(candidateDictBuffer);
9734d3f1eafSConrad Meyer     return COVER_dictSelectionError(dictContentSize);
9744d3f1eafSConrad Meyer   }
9754d3f1eafSConrad Meyer 
9764d3f1eafSConrad Meyer   /* Initial dictionary size and compressed size */
9774d3f1eafSConrad Meyer   memcpy(largestDictbuffer, customDictContent, dictContentSize);
9784d3f1eafSConrad Meyer   dictContentSize = ZDICT_finalizeDictionary(
979*f7cd7fe5SConrad Meyer     largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,
9804d3f1eafSConrad Meyer     samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
9814d3f1eafSConrad Meyer 
9824d3f1eafSConrad Meyer   if (ZDICT_isError(dictContentSize)) {
9834d3f1eafSConrad Meyer     free(largestDictbuffer);
9844d3f1eafSConrad Meyer     free(candidateDictBuffer);
9854d3f1eafSConrad Meyer     return COVER_dictSelectionError(dictContentSize);
9864d3f1eafSConrad Meyer   }
9874d3f1eafSConrad Meyer 
9884d3f1eafSConrad Meyer   totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
9894d3f1eafSConrad Meyer                                                        samplesBuffer, offsets,
9904d3f1eafSConrad Meyer                                                        nbCheckSamples, nbSamples,
9914d3f1eafSConrad Meyer                                                        largestDictbuffer, dictContentSize);
9924d3f1eafSConrad Meyer 
9934d3f1eafSConrad Meyer   if (ZSTD_isError(totalCompressedSize)) {
9944d3f1eafSConrad Meyer     free(largestDictbuffer);
9954d3f1eafSConrad Meyer     free(candidateDictBuffer);
9964d3f1eafSConrad Meyer     return COVER_dictSelectionError(totalCompressedSize);
9974d3f1eafSConrad Meyer   }
9984d3f1eafSConrad Meyer 
9994d3f1eafSConrad Meyer   if (params.shrinkDict == 0) {
10004d3f1eafSConrad Meyer     COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
10014d3f1eafSConrad Meyer     free(candidateDictBuffer);
10024d3f1eafSConrad Meyer     return selection;
10034d3f1eafSConrad Meyer   }
10044d3f1eafSConrad Meyer 
10054d3f1eafSConrad Meyer   largestDict = dictContentSize;
10064d3f1eafSConrad Meyer   largestCompressed = totalCompressedSize;
10074d3f1eafSConrad Meyer   dictContentSize = ZDICT_DICTSIZE_MIN;
10084d3f1eafSConrad Meyer 
10094d3f1eafSConrad Meyer   /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
10104d3f1eafSConrad Meyer   while (dictContentSize < largestDict) {
10114d3f1eafSConrad Meyer     memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
10124d3f1eafSConrad Meyer     dictContentSize = ZDICT_finalizeDictionary(
1013*f7cd7fe5SConrad Meyer       candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,
10144d3f1eafSConrad Meyer       samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
10154d3f1eafSConrad Meyer 
10164d3f1eafSConrad Meyer     if (ZDICT_isError(dictContentSize)) {
10174d3f1eafSConrad Meyer       free(largestDictbuffer);
10184d3f1eafSConrad Meyer       free(candidateDictBuffer);
10194d3f1eafSConrad Meyer       return COVER_dictSelectionError(dictContentSize);
10204d3f1eafSConrad Meyer 
10214d3f1eafSConrad Meyer     }
10224d3f1eafSConrad Meyer 
10234d3f1eafSConrad Meyer     totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
10244d3f1eafSConrad Meyer                                                          samplesBuffer, offsets,
10254d3f1eafSConrad Meyer                                                          nbCheckSamples, nbSamples,
10264d3f1eafSConrad Meyer                                                          candidateDictBuffer, dictContentSize);
10274d3f1eafSConrad Meyer 
10284d3f1eafSConrad Meyer     if (ZSTD_isError(totalCompressedSize)) {
10294d3f1eafSConrad Meyer       free(largestDictbuffer);
10304d3f1eafSConrad Meyer       free(candidateDictBuffer);
10314d3f1eafSConrad Meyer       return COVER_dictSelectionError(totalCompressedSize);
10324d3f1eafSConrad Meyer     }
10334d3f1eafSConrad Meyer 
10344d3f1eafSConrad Meyer     if (totalCompressedSize <= largestCompressed * regressionTolerance) {
10354d3f1eafSConrad Meyer       COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
10364d3f1eafSConrad Meyer       free(largestDictbuffer);
10374d3f1eafSConrad Meyer       return selection;
10384d3f1eafSConrad Meyer     }
10394d3f1eafSConrad Meyer     dictContentSize *= 2;
10404d3f1eafSConrad Meyer   }
10414d3f1eafSConrad Meyer   dictContentSize = largestDict;
10424d3f1eafSConrad Meyer   totalCompressedSize = largestCompressed;
10434d3f1eafSConrad Meyer   {
10444d3f1eafSConrad Meyer     COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
10454d3f1eafSConrad Meyer     free(candidateDictBuffer);
10464d3f1eafSConrad Meyer     return selection;
10474d3f1eafSConrad Meyer   }
10484d3f1eafSConrad Meyer }
10494d3f1eafSConrad Meyer 
10500c16b537SWarner Losh /**
10510c16b537SWarner Losh  * Parameters for COVER_tryParameters().
10520c16b537SWarner Losh  */
10530c16b537SWarner Losh typedef struct COVER_tryParameters_data_s {
10540c16b537SWarner Losh   const COVER_ctx_t *ctx;
10550c16b537SWarner Losh   COVER_best_t *best;
10560c16b537SWarner Losh   size_t dictBufferCapacity;
10570c16b537SWarner Losh   ZDICT_cover_params_t parameters;
10580c16b537SWarner Losh } COVER_tryParameters_data_t;
10590c16b537SWarner Losh 
10600c16b537SWarner Losh /**
10610f743729SConrad Meyer  * Tries a set of parameters and updates the COVER_best_t with the results.
10620c16b537SWarner Losh  * This function is thread safe if zstd is compiled with multithreaded support.
10630c16b537SWarner Losh  * It takes its parameters as an *OWNING* opaque pointer to support threading.
10640c16b537SWarner Losh  */
10650c16b537SWarner Losh static void COVER_tryParameters(void *opaque) {
10660c16b537SWarner Losh   /* Save parameters as local variables */
10670c16b537SWarner Losh   COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
10680c16b537SWarner Losh   const COVER_ctx_t *const ctx = data->ctx;
10690c16b537SWarner Losh   const ZDICT_cover_params_t parameters = data->parameters;
10700c16b537SWarner Losh   size_t dictBufferCapacity = data->dictBufferCapacity;
10710c16b537SWarner Losh   size_t totalCompressedSize = ERROR(GENERIC);
10720c16b537SWarner Losh   /* Allocate space for hash table, dict, and freqs */
10730c16b537SWarner Losh   COVER_map_t activeDmers;
10740c16b537SWarner Losh   BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
10754d3f1eafSConrad Meyer   COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
10760c16b537SWarner Losh   U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
10770c16b537SWarner Losh   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
10780c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
10790c16b537SWarner Losh     goto _cleanup;
10800c16b537SWarner Losh   }
10810c16b537SWarner Losh   if (!dict || !freqs) {
10820c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
10830c16b537SWarner Losh     goto _cleanup;
10840c16b537SWarner Losh   }
10850c16b537SWarner Losh   /* Copy the frequencies because we need to modify them */
10860c16b537SWarner Losh   memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
10870c16b537SWarner Losh   /* Build the dictionary */
10880c16b537SWarner Losh   {
10890c16b537SWarner Losh     const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
10900c16b537SWarner Losh                                               dictBufferCapacity, parameters);
1091*f7cd7fe5SConrad Meyer     selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
10924d3f1eafSConrad Meyer         ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
10934d3f1eafSConrad Meyer         totalCompressedSize);
10944d3f1eafSConrad Meyer 
10954d3f1eafSConrad Meyer     if (COVER_dictSelectionIsError(selection)) {
10964d3f1eafSConrad Meyer       DISPLAYLEVEL(1, "Failed to select dictionary\n");
10970c16b537SWarner Losh       goto _cleanup;
10980c16b537SWarner Losh     }
10990c16b537SWarner Losh   }
11000c16b537SWarner Losh _cleanup:
11014d3f1eafSConrad Meyer   free(dict);
11024d3f1eafSConrad Meyer   COVER_best_finish(data->best, parameters, selection);
11030c16b537SWarner Losh   free(data);
11040c16b537SWarner Losh   COVER_map_destroy(&activeDmers);
11054d3f1eafSConrad Meyer   COVER_dictSelectionFree(selection);
11060c16b537SWarner Losh   if (freqs) {
11070c16b537SWarner Losh     free(freqs);
11080c16b537SWarner Losh   }
11090c16b537SWarner Losh }
11100c16b537SWarner Losh 
11110c16b537SWarner Losh ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
11120c16b537SWarner Losh     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
11130c16b537SWarner Losh     const size_t *samplesSizes, unsigned nbSamples,
11140c16b537SWarner Losh     ZDICT_cover_params_t *parameters) {
11150c16b537SWarner Losh   /* constants */
11160c16b537SWarner Losh   const unsigned nbThreads = parameters->nbThreads;
11170f743729SConrad Meyer   const double splitPoint =
1118*f7cd7fe5SConrad Meyer       parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
11190c16b537SWarner Losh   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
11200c16b537SWarner Losh   const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
11210c16b537SWarner Losh   const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
11220c16b537SWarner Losh   const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
11230c16b537SWarner Losh   const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
11240c16b537SWarner Losh   const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
11250c16b537SWarner Losh   const unsigned kIterations =
11260c16b537SWarner Losh       (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
11274d3f1eafSConrad Meyer   const unsigned shrinkDict = 0;
11280c16b537SWarner Losh   /* Local variables */
11290c16b537SWarner Losh   const int displayLevel = parameters->zParams.notificationLevel;
11300c16b537SWarner Losh   unsigned iteration = 1;
11310c16b537SWarner Losh   unsigned d;
11320c16b537SWarner Losh   unsigned k;
11330c16b537SWarner Losh   COVER_best_t best;
11340c16b537SWarner Losh   POOL_ctx *pool = NULL;
11352b9c00cbSConrad Meyer   int warned = 0;
113619fcbaf1SConrad Meyer 
11370c16b537SWarner Losh   /* Checks */
11380f743729SConrad Meyer   if (splitPoint <= 0 || splitPoint > 1) {
11390f743729SConrad Meyer     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
11404d3f1eafSConrad Meyer     return ERROR(parameter_outOfBound);
11410f743729SConrad Meyer   }
11420c16b537SWarner Losh   if (kMinK < kMaxD || kMaxK < kMinK) {
11430c16b537SWarner Losh     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
11444d3f1eafSConrad Meyer     return ERROR(parameter_outOfBound);
11450c16b537SWarner Losh   }
11460c16b537SWarner Losh   if (nbSamples == 0) {
11470c16b537SWarner Losh     DISPLAYLEVEL(1, "Cover must have at least one input file\n");
11484d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
11490c16b537SWarner Losh   }
11500c16b537SWarner Losh   if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
11510c16b537SWarner Losh     DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
11520c16b537SWarner Losh                  ZDICT_DICTSIZE_MIN);
11530c16b537SWarner Losh     return ERROR(dstSize_tooSmall);
11540c16b537SWarner Losh   }
11550c16b537SWarner Losh   if (nbThreads > 1) {
11560c16b537SWarner Losh     pool = POOL_create(nbThreads, 1);
11570c16b537SWarner Losh     if (!pool) {
11580c16b537SWarner Losh       return ERROR(memory_allocation);
11590c16b537SWarner Losh     }
11600c16b537SWarner Losh   }
11610c16b537SWarner Losh   /* Initialization */
11620c16b537SWarner Losh   COVER_best_init(&best);
11630c16b537SWarner Losh   /* Turn down global display level to clean up display at level 2 and below */
11640c16b537SWarner Losh   g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
11650c16b537SWarner Losh   /* Loop through d first because each new value needs a new context */
11660c16b537SWarner Losh   LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
11670c16b537SWarner Losh                     kIterations);
11680c16b537SWarner Losh   for (d = kMinD; d <= kMaxD; d += 2) {
11690c16b537SWarner Losh     /* Initialize the context for this value of d */
11700c16b537SWarner Losh     COVER_ctx_t ctx;
11710c16b537SWarner Losh     LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
11724d3f1eafSConrad Meyer     {
11734d3f1eafSConrad Meyer       const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
11744d3f1eafSConrad Meyer       if (ZSTD_isError(initVal)) {
11750c16b537SWarner Losh         LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
11760c16b537SWarner Losh         COVER_best_destroy(&best);
11770c16b537SWarner Losh         POOL_free(pool);
11784d3f1eafSConrad Meyer         return initVal;
11794d3f1eafSConrad Meyer       }
11800c16b537SWarner Losh     }
11812b9c00cbSConrad Meyer     if (!warned) {
11822b9c00cbSConrad Meyer       COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
11832b9c00cbSConrad Meyer       warned = 1;
11842b9c00cbSConrad Meyer     }
11850c16b537SWarner Losh     /* Loop through k reusing the same context */
11860c16b537SWarner Losh     for (k = kMinK; k <= kMaxK; k += kStepSize) {
11870c16b537SWarner Losh       /* Prepare the arguments */
11880c16b537SWarner Losh       COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
11890c16b537SWarner Losh           sizeof(COVER_tryParameters_data_t));
11900c16b537SWarner Losh       LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
11910c16b537SWarner Losh       if (!data) {
11920c16b537SWarner Losh         LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
11930c16b537SWarner Losh         COVER_best_destroy(&best);
11940c16b537SWarner Losh         COVER_ctx_destroy(&ctx);
11950c16b537SWarner Losh         POOL_free(pool);
11964d3f1eafSConrad Meyer         return ERROR(memory_allocation);
11970c16b537SWarner Losh       }
11980c16b537SWarner Losh       data->ctx = &ctx;
11990c16b537SWarner Losh       data->best = &best;
12000c16b537SWarner Losh       data->dictBufferCapacity = dictBufferCapacity;
12010c16b537SWarner Losh       data->parameters = *parameters;
12020c16b537SWarner Losh       data->parameters.k = k;
12030c16b537SWarner Losh       data->parameters.d = d;
12040f743729SConrad Meyer       data->parameters.splitPoint = splitPoint;
12050c16b537SWarner Losh       data->parameters.steps = kSteps;
12064d3f1eafSConrad Meyer       data->parameters.shrinkDict = shrinkDict;
12070c16b537SWarner Losh       data->parameters.zParams.notificationLevel = g_displayLevel;
12080c16b537SWarner Losh       /* Check the parameters */
12090c16b537SWarner Losh       if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
12100c16b537SWarner Losh         DISPLAYLEVEL(1, "Cover parameters incorrect\n");
12110c16b537SWarner Losh         free(data);
12120c16b537SWarner Losh         continue;
12130c16b537SWarner Losh       }
12140c16b537SWarner Losh       /* Call the function and pass ownership of data to it */
12150c16b537SWarner Losh       COVER_best_start(&best);
12160c16b537SWarner Losh       if (pool) {
12170c16b537SWarner Losh         POOL_add(pool, &COVER_tryParameters, data);
12180c16b537SWarner Losh       } else {
12190c16b537SWarner Losh         COVER_tryParameters(data);
12200c16b537SWarner Losh       }
12210c16b537SWarner Losh       /* Print status */
12220c16b537SWarner Losh       LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
1223a0483764SConrad Meyer                          (unsigned)((iteration * 100) / kIterations));
12240c16b537SWarner Losh       ++iteration;
12250c16b537SWarner Losh     }
12260c16b537SWarner Losh     COVER_best_wait(&best);
12270c16b537SWarner Losh     COVER_ctx_destroy(&ctx);
12280c16b537SWarner Losh   }
12290c16b537SWarner Losh   LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
12300c16b537SWarner Losh   /* Fill the output buffer and parameters with output of the best parameters */
12310c16b537SWarner Losh   {
12320c16b537SWarner Losh     const size_t dictSize = best.dictSize;
12330c16b537SWarner Losh     if (ZSTD_isError(best.compressedSize)) {
12340c16b537SWarner Losh       const size_t compressedSize = best.compressedSize;
12350c16b537SWarner Losh       COVER_best_destroy(&best);
12360c16b537SWarner Losh       POOL_free(pool);
12370c16b537SWarner Losh       return compressedSize;
12380c16b537SWarner Losh     }
12390c16b537SWarner Losh     *parameters = best.parameters;
12400c16b537SWarner Losh     memcpy(dictBuffer, best.dict, dictSize);
12410c16b537SWarner Losh     COVER_best_destroy(&best);
12420c16b537SWarner Losh     POOL_free(pool);
12430c16b537SWarner Losh     return dictSize;
12440c16b537SWarner Losh   }
12450c16b537SWarner Losh }
1246