Lines Matching +full:group +full:- +full:index +full:- +full:shift

5  * This source code is licensed under both the BSD-style license (found in the
8 * You may select, at your option, one of the above-listed licenses.
15 * Effective Construction of Relative Lempel-Ziv Dictionaries
21 /*-*************************************
40 /*-*************************************
47 * Most 32bit platforms have 2GB user-mode addressable space and we allocate a large
50 #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
53 /*-*************************************
80 if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) { \
88 /*-*************************************
97 #define MAP_EMPTY_VALUE ((U32)-1)
114 memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t)); in COVER_map_clear()
124 map->sizeLog = ZSTD_highbit32(size) + 2; in COVER_map_init()
125 map->size = (U32)1 << map->sizeLog; in COVER_map_init()
126 map->sizeMask = map->size - 1; in COVER_map_init()
127 map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t)); in COVER_map_init()
128 if (!map->data) { in COVER_map_init()
129 map->sizeLog = 0; in COVER_map_init()
130 map->size = 0; in COVER_map_init()
142 return (key * COVER_prime4bytes) >> (32 - map->sizeLog); in COVER_map_hash()
146 * Helper function that returns the index that a key should be placed into.
151 for (i = hash;; i = (i + 1) & map->sizeMask) { in COVER_map_index()
152 COVER_map_pair_t *pos = &map->data[i]; in COVER_map_index()
153 if (pos->value == MAP_EMPTY_VALUE) { in COVER_map_index()
156 if (pos->key == key) { in COVER_map_index()
168 COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)]; in COVER_map_at()
169 if (pos->value == MAP_EMPTY_VALUE) { in COVER_map_at()
170 pos->key = key; in COVER_map_at()
171 pos->value = 0; in COVER_map_at()
173 return &pos->value; in COVER_map_at()
181 COVER_map_pair_t *del = &map->data[i]; in COVER_map_remove()
182 U32 shift = 1; in COVER_map_remove() local
183 if (del->value == MAP_EMPTY_VALUE) { in COVER_map_remove()
186 for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) { in COVER_map_remove()
187 COVER_map_pair_t *const pos = &map->data[i]; in COVER_map_remove()
189 if (pos->value == MAP_EMPTY_VALUE) { in COVER_map_remove()
190 del->value = MAP_EMPTY_VALUE; in COVER_map_remove()
194 if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) { in COVER_map_remove()
195 del->key = pos->key; in COVER_map_remove()
196 del->value = pos->value; in COVER_map_remove()
198 shift = 1; in COVER_map_remove()
200 ++shift; in COVER_map_remove()
209 if (map->data) { in COVER_map_destroy()
210 free(map->data); in COVER_map_destroy()
212 map->data = NULL; in COVER_map_destroy()
213 map->size = 0; in COVER_map_destroy()
216 /*-*************************************
237 /*-*************************************
254 * Returns -1 if the dmer at lp is less than the dmer at rp.
261 return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d); in COVER_cmp()
267 U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1); in COVER_cmp8()
268 U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask; in COVER_cmp8()
269 U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask; in COVER_cmp8()
271 return -1; in COVER_cmp8()
284 result = lp < rp ? -1 : 1; in COVER_strict_cmp()
294 result = lp < rp ? -1 : 1; in COVER_strict_cmp8()
305 size_t count = last - first; in COVER_lower_bound()
312 count -= step + 1; in COVER_lower_bound()
323 * Calls grp for each group.
343 /*-*************************************
348 * Called on each group of positions with the same dmer.
350 * Fills `ctx->dmerAt`.
352 static void COVER_group(COVER_ctx_t *ctx, const void *group, in COVER_group() argument
354 /* The group consists of all the positions with the same first d bytes. */ in COVER_group()
355 const U32 *grpPtr = (const U32 *)group; in COVER_group()
361 const U32 dmerId = (U32)(grpPtr - ctx->suffix); in COVER_group()
365 const size_t *curOffsetPtr = ctx->offsets; in COVER_group()
366 const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples; in COVER_group()
370 size_t curSampleEnd = ctx->offsets[0]; in COVER_group()
373 ctx->dmerAt[*grpPtr] = dmerId; in COVER_group()
395 * We store the frequency of the dmer in the first position of the group, in COVER_group()
398 ctx->suffix[dmerId] = freq; in COVER_group()
409 * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
420 const U32 dmersInK = k - d + 1; in COVER_selectSegment()
435 U32 newDmer = ctx->dmerAt[activeSegment.end]; in COVER_selectSegment()
440 /* The paper suggest using the L-0.5 norm, but experiments show that it in COVER_selectSegment()
450 if (activeSegment.end - activeSegment.begin == dmersInK + 1) { in COVER_selectSegment()
451 U32 delDmer = ctx->dmerAt[activeSegment.begin]; in COVER_selectSegment()
454 *delDmerOcc -= 1; in COVER_selectSegment()
458 activeSegment.score -= freqs[delDmer]; in COVER_selectSegment()
473 U32 freq = freqs[ctx->dmerAt[pos]]; in COVER_selectSegment()
486 freqs[ctx->dmerAt[pos]] = 0; in COVER_selectSegment()
494 * Returns non-zero if the parameters are valid and 0 otherwise.
524 if (ctx->suffix) { in COVER_ctx_destroy()
525 free(ctx->suffix); in COVER_ctx_destroy()
526 ctx->suffix = NULL; in COVER_ctx_destroy()
528 if (ctx->freqs) { in COVER_ctx_destroy()
529 free(ctx->freqs); in COVER_ctx_destroy()
530 ctx->freqs = NULL; in COVER_ctx_destroy()
532 if (ctx->dmerAt) { in COVER_ctx_destroy()
533 free(ctx->dmerAt); in COVER_ctx_destroy()
534 ctx->dmerAt = NULL; in COVER_ctx_destroy()
536 if (ctx->offsets) { in COVER_ctx_destroy()
537 free(ctx->offsets); in COVER_ctx_destroy()
538 ctx->offsets = NULL; in COVER_ctx_destroy()
556 const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples; in COVER_ctx_init()
582 ctx->samples = samples; in COVER_ctx_init()
583 ctx->samplesSizes = samplesSizes; in COVER_ctx_init()
584 ctx->nbSamples = nbSamples; in COVER_ctx_init()
585 ctx->nbTrainSamples = nbTrainSamples; in COVER_ctx_init()
586 ctx->nbTestSamples = nbTestSamples; in COVER_ctx_init()
588 ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1; in COVER_ctx_init()
589 ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); in COVER_ctx_init()
590 /* Maps index to the dmerID */ in COVER_ctx_init()
591 ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); in COVER_ctx_init()
593 ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); in COVER_ctx_init()
594 if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) { in COVER_ctx_init()
599 ctx->freqs = NULL; in COVER_ctx_init()
600 ctx->d = d; in COVER_ctx_init()
605 ctx->offsets[0] = 0; in COVER_ctx_init()
607 ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; in COVER_ctx_init()
614 * The sort is stable, so each dmer group is sorted by position in input. in COVER_ctx_init()
617 for (i = 0; i < ctx->suffixSize; ++i) { in COVER_ctx_init()
618 ctx->suffix[i] = i; in COVER_ctx_init()
625 mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32), in COVER_ctx_init()
626 (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); in COVER_ctx_init()
628 qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), in COVER_ctx_init()
629 (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); in COVER_ctx_init()
633 /* For each dmer group (group of positions with the same first d bytes): in COVER_ctx_init()
635 * (groupBeginPtr - suffix). This allows us to go from position to in COVER_ctx_init()
640 COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, in COVER_ctx_init()
641 (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group); in COVER_ctx_init()
642 ctx->freqs = ctx->suffix; in COVER_ctx_init()
643 ctx->suffix = NULL; in COVER_ctx_init()
691 (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4); in COVER_buildDictionary()
718 segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); in COVER_buildDictionary()
725 tail -= segmentSize; in COVER_buildDictionary()
726 memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); in COVER_buildDictionary()
729 (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); in COVER_buildDictionary()
769 if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { in ZDICT_trainFromBuffer_cover()
781 dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, in ZDICT_trainFromBuffer_cover()
853 (void)ZSTD_pthread_mutex_init(&best->mutex, NULL); in COVER_best_init()
854 (void)ZSTD_pthread_cond_init(&best->cond, NULL); in COVER_best_init()
855 best->liveJobs = 0; in COVER_best_init()
856 best->dict = NULL; in COVER_best_init()
857 best->dictSize = 0; in COVER_best_init()
858 best->compressedSize = (size_t)-1; in COVER_best_init()
859 memset(&best->parameters, 0, sizeof(best->parameters)); in COVER_best_init()
869 ZSTD_pthread_mutex_lock(&best->mutex); in COVER_best_wait()
870 while (best->liveJobs != 0) { in COVER_best_wait()
871 ZSTD_pthread_cond_wait(&best->cond, &best->mutex); in COVER_best_wait()
873 ZSTD_pthread_mutex_unlock(&best->mutex); in COVER_best_wait()
884 if (best->dict) { in COVER_best_destroy()
885 free(best->dict); in COVER_best_destroy()
887 ZSTD_pthread_mutex_destroy(&best->mutex); in COVER_best_destroy()
888 ZSTD_pthread_cond_destroy(&best->cond); in COVER_best_destroy()
899 ZSTD_pthread_mutex_lock(&best->mutex); in COVER_best_start()
900 ++best->liveJobs; in COVER_best_start()
901 ZSTD_pthread_mutex_unlock(&best->mutex); in COVER_best_start()
919 ZSTD_pthread_mutex_lock(&best->mutex); in COVER_best_finish()
920 --best->liveJobs; in COVER_best_finish()
921 liveJobs = best->liveJobs; in COVER_best_finish()
923 if (compressedSize < best->compressedSize) { in COVER_best_finish()
925 if (!best->dict || best->dictSize < dictSize) { in COVER_best_finish()
926 if (best->dict) { in COVER_best_finish()
927 free(best->dict); in COVER_best_finish()
929 best->dict = malloc(dictSize); in COVER_best_finish()
930 if (!best->dict) { in COVER_best_finish()
931 best->compressedSize = ERROR(GENERIC); in COVER_best_finish()
932 best->dictSize = 0; in COVER_best_finish()
933 ZSTD_pthread_cond_signal(&best->cond); in COVER_best_finish()
934 ZSTD_pthread_mutex_unlock(&best->mutex); in COVER_best_finish()
940 memcpy(best->dict, dict, dictSize); in COVER_best_finish()
941 best->dictSize = dictSize; in COVER_best_finish()
942 best->parameters = parameters; in COVER_best_finish()
943 best->compressedSize = compressedSize; in COVER_best_finish()
947 ZSTD_pthread_cond_broadcast(&best->cond); in COVER_best_finish()
949 ZSTD_pthread_mutex_unlock(&best->mutex); in COVER_best_finish()
1021 … candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize, in COVER_selectDict()
1077 const COVER_ctx_t *const ctx = data->ctx; in COVER_tryParameters()
1078 const ZDICT_cover_params_t parameters = data->parameters; in COVER_tryParameters()
1079 size_t dictBufferCapacity = data->dictBufferCapacity; in COVER_tryParameters()
1085 U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32)); in COVER_tryParameters()
1086 if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { in COVER_tryParameters()
1095 memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32)); in COVER_tryParameters()
1100 selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail, in COVER_tryParameters()
1101 …ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSample… in COVER_tryParameters()
1111 COVER_best_finish(data->best, parameters, selection); in COVER_tryParameters()
1124 const unsigned nbThreads = parameters->nbThreads; in ZDICT_optimizeTrainFromBuffer_cover()
1126 parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint; in ZDICT_optimizeTrainFromBuffer_cover()
1127 const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; in ZDICT_optimizeTrainFromBuffer_cover()
1128 const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; in ZDICT_optimizeTrainFromBuffer_cover()
1129 const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; in ZDICT_optimizeTrainFromBuffer_cover()
1130 const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k; in ZDICT_optimizeTrainFromBuffer_cover()
1131 const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps; in ZDICT_optimizeTrainFromBuffer_cover()
1132 const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); in ZDICT_optimizeTrainFromBuffer_cover()
1134 (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); in ZDICT_optimizeTrainFromBuffer_cover()
1137 const int displayLevel = parameters->zParams.notificationLevel; in ZDICT_optimizeTrainFromBuffer_cover()
1172 g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1; in ZDICT_optimizeTrainFromBuffer_cover()
1206 data->ctx = &ctx; in ZDICT_optimizeTrainFromBuffer_cover()
1207 data->best = &best; in ZDICT_optimizeTrainFromBuffer_cover()
1208 data->dictBufferCapacity = dictBufferCapacity; in ZDICT_optimizeTrainFromBuffer_cover()
1209 data->parameters = *parameters; in ZDICT_optimizeTrainFromBuffer_cover()
1210 data->parameters.k = k; in ZDICT_optimizeTrainFromBuffer_cover()
1211 data->parameters.d = d; in ZDICT_optimizeTrainFromBuffer_cover()
1212 data->parameters.splitPoint = splitPoint; in ZDICT_optimizeTrainFromBuffer_cover()
1213 data->parameters.steps = kSteps; in ZDICT_optimizeTrainFromBuffer_cover()
1214 data->parameters.shrinkDict = shrinkDict; in ZDICT_optimizeTrainFromBuffer_cover()
1215 data->parameters.zParams.notificationLevel = g_displayLevel; in ZDICT_optimizeTrainFromBuffer_cover()
1217 if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) { in ZDICT_optimizeTrainFromBuffer_cover()