// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
#include "zstd_fast.h"

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
    U32  const mls = cParams->minMatch;
    const BYTE* const base = ms->window.base;
    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
    const U32 fastHashFillStep = 3;

    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
     * Feel free to remove this assert if there's a good reason! */
    assert(dtlm == ZSTD_dtlm_full);

    /* Always insert every fastHashFillStep position into the hash table.
     * Insert the other positions if their hash entry is empty.
     */
    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
        U32 const curr = (U32)(ip - base);
        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }

        if (dtlm == ZSTD_dtlm_fast) continue;
        /* Only load extra positions for ZSTD_dtlm_full */
        {   U32 p;
            for (p = 1; p < fastHashFillStep; ++p) {
                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
    }   }   }   }
}

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hBits = cParams->hashLog;
    U32  const mls = cParams->minMatch;
    const BYTE* const base = ms->window.base;
    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
    const U32 fastHashFillStep = 3;

    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
     * Feel free to remove this assert if there's a good reason! */
    assert(dtlm == ZSTD_dtlm_fast);

    /* Always insert every fastHashFillStep position into the hash table.
     * Insert the other positions if their hash entry is empty.
     */
    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
        U32 const curr = (U32)(ip - base);
        size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
        hashTable[hash0] = curr;
        if (dtlm == ZSTD_dtlm_fast) continue;
        /* Only load extra positions for ZSTD_dtlm_full */
        {   U32 p;
            for (p = 1; p < fastHashFillStep; ++p) {
                size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
                if (hashTable[hash] == 0) {  /* not yet filled */
                    hashTable[hash] = curr + p;
    }   }   }   }
}

void ZSTD_fillHashTable(ZSTD_MatchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm,
                        ZSTD_tableFillPurpose_e tfp)
{
    if (tfp == ZSTD_tfp_forCDict) {
        ZSTD_fillHashTableForCDict(ms, end, dtlm);
    } else {
        ZSTD_fillHashTableForCCtx(ms, end, dtlm);
    }
}


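/* Returns 1 when there is a valid 4-byte match at matchAddress for the data at currentPtr,
 * i.e. matchIdx is within the valid range (>= idxLowLimit) and the first 4 bytes are equal.
 * Two implementations follow: one based on a conditional move, one based on a branch. */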
typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit);

static int
ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
{
    /* Array of ~random data, should have low probability of matching data.
     * Load from here if the index is invalid.
     * Used to avoid unpredictable branches. */
    static const BYTE dummy[] = {0x12,0x34,0x56,0x78};

    /* matchIdx >= idxLowLimit is a (somewhat) unpredictable branch.
     * However, the expression below compiles into a conditional move.
     */
    const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy);
    /* Note: this used to be written as: return test1 && test2;
     * Unfortunately, once inlined, these tests become branches,
     * in which case it becomes critical that they are executed in the right order (test1 then test2).
     * So we have to write these tests in a specific manner to ensure their ordering.
     */
    if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0;
    /* force ordering of these tests, which matters once the function is inlined, as they become branches */
    __asm__("");
    return matchIdx >= idxLowLimit;
}

static int
ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
{
    /* using a branch instead of a cmov,
     * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true,
     * aka almost all candidates are within range */
    U32 mval;
    if (matchIdx >= idxLowLimit) {
        mval = MEM_read32(matchAddress);
    } else {
        mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */
    }

    return (MEM_read32(currentPtr) == mval);
}
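
/* The cmov variant is selected when the "candidate within range" test is expected to be
 * unpredictable (small windows; see ZSTD_compressBlock_fast below), while the branch variant
 * is preferred when nearly all candidates are within range. */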


/*
 * If you squint hard enough (and ignore repcodes), the search operation at any
 * given position is broken into 4 stages:
 *
 * 1. Hash   (map position to hash value via input read)
 * 2. Lookup (map hash val to index via hashtable read)
 * 3. Load   (map index to value at that position via input read)
 * 4. Compare
 *
 * Each of these steps involves a memory read at an address which is computed
 * from the previous step. This means these steps must be sequenced and their
 * latencies are cumulative.
 *
 * Rather than do 1->2->3->4 sequentially for a single position before moving
 * onto the next, this implementation interleaves these operations across the
 * next few positions:
 *
 * R = Repcode Read & Compare
 * H = Hash
 * T = Table Lookup
 * M = Match Read & Compare
 *
 * Pos | Time -->
 * ----+-------------------
 * N   | ... M
 * N+1 | ...   TM
 * N+2 |    R H   T M
 * N+3 |         H    TM
 * N+4 |    R H     T M
 * N+5 |         H      ...
 * N+6 |    R           ...
 *
 * This is very much analogous to the pipelining of execution in a CPU. And just
 * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
 * branch).
 *
 * When this happens, we throw away our current state, and do the following prep
 * to re-enter the loop:
 *
 * Pos | Time -->
 * ----+-------------------
 * N   | H T
 * N+1 |   H
 *
 * This is also the work we do at the beginning to enter the loop initially.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_noDict_generic(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize,
        U32 const mls, int useCmov)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */
    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
    const U32   prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;

    const BYTE* anchor = istart;
    const BYTE* ip0 = istart;
    const BYTE* ip1;
    const BYTE* ip2;
    const BYTE* ip3;
    U32 current0;

    U32 rep_offset1 = rep[0];
    U32 rep_offset2 = rep[1];
    U32 offsetSaved1 = 0, offsetSaved2 = 0;

    size_t hash0; /* hash for ip0 */
    size_t hash1; /* hash for ip1 */
    U32 matchIdx; /* match idx for ip0 */

    U32 offcode;
    const BYTE* match0;
    size_t mLength;

    /* ip0 and ip1 are always adjacent. The targetLength skipping and
     * uncompressibility acceleration is applied to every other position,
     * matching the behavior of #1562. step therefore represents the gap
     * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
    size_t step;
    const BYTE* nextStep;
    const size_t kStepIncr = (1 << (kSearchStrength - 1));
    const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch;
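    /* Note: useCmov is a compile-time template parameter (see ZSTD_GEN_FAST_FN below), so with
     * forced inlining this function pointer is expected to resolve to a direct call in each
     * generated variant rather than an indirect call. */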

    DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
    ip0 += (ip0 == prefixStart);
    {   U32 const curr = (U32)(ip0 - base);
        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
        U32 const maxRep = curr - windowLow;
        if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
        if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
    }

    /* start each op */
_start: /* Requires: ip0 */

    step = stepSize;
    nextStep = ip0 + kStepIncr;

    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
    ip1 = ip0 + 1;
    ip2 = ip0 + step;
    ip3 = ip2 + 1;

    if (ip3 >= ilimit) {
        goto _cleanup;
    }

    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
    hash1 = ZSTD_hashPtr(ip1, hlog, mls);

    matchIdx = hashTable[hash0];

    do {
        /* load repcode match for ip[2] */
        const U32 rval = MEM_read32(ip2 - rep_offset1);

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        /* check repcode at ip[2] */
        if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
            ip0 = ip2;
            match0 = ip0 - rep_offset1;
            mLength = ip0[-1] == match0[-1];
            ip0 -= mLength;
            match0 -= mLength;
            offcode = REPCODE1_TO_OFFBASE;
            mLength += 4;

            /* Write next hash table entry: it's already calculated.
             * This write is known to be safe because ip1 is before the
             * repcode (ip2). */
            hashTable[hash1] = (U32)(ip1 - base);

            goto _match;
        }

        if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
            /* Write next hash table entry (it's already calculated).
             * This write is known to be safe because ip1 == ip0 + 1,
             * so searching will resume after ip1. */
            hashTable[hash1] = (U32)(ip1 - base);

            goto _offset;
        }

        /* lookup ip[1] */
        matchIdx = hashTable[hash1];

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip3;

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
            /* Write next hash table entry, since it's already calculated */
            if (step <= 4) {
                /* Avoid writing an index if it's >= position where search will resume.
                 * The minimum possible match has length 4, so search can resume at ip0 + 4.
                 */
                hashTable[hash1] = (U32)(ip1 - base);
            }
            goto _offset;
        }

        /* lookup ip[1] */
        matchIdx = hashTable[hash1];

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip0 + step;
        ip3 = ip1 + step;

        /* calculate step */
        if (ip2 >= nextStep) {
            step++;
            PREFETCH_L1(ip1 + 64);
            PREFETCH_L1(ip1 + 128);
            nextStep += kStepIncr;
        }
    } while (ip3 < ilimit);

_cleanup:
    /* Note that there are probably still a couple positions one could search.
     * However, it seems to be a meaningful performance hit to try to search
     * them. So let's not. */

    /* When the repcodes are outside of the prefix, we set them to zero before the loop.
     * When the offsets are still zero, we need to restore them after the block to have a correct
     * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
     * offsets were invalid. We need to figure out which offset to refill with.
     *     - If both offsets are zero they are in the same order.
     *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
     *     - If only one is zero, we need to decide which offset to restore.
     *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
     *         - It is impossible for rep_offset2 to be non-zero.
     *
     * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
     * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
     */
    offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;

    /* save reps for next block */
    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);

_offset: /* Requires: ip0, idx */

    /* Compute the offset code. */
    match0 = base + matchIdx;
    rep_offset2 = rep_offset1;
    rep_offset1 = (U32)(ip0-match0);
    offcode = OFFSET_TO_OFFBASE(rep_offset1);
    mLength = 4;

    /* Count the backwards match length. */
    while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {
        ip0--;
        match0--;
        mLength++;
    }

_match: /* Requires: ip0, match0, offcode */

    /* Count the forward length. */
    mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);

    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);

    ip0 += mLength;
    anchor = ip0;

    /* Fill table and check for immediate repcode. */
    if (ip0 <= ilimit) {
        /* Fill Table */
        assert(base+current0+2 > istart);  /* check base overflow */
        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

        if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
            while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
                /* store sequence */
                size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
                { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                ip0 += rLength;
                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                anchor = ip0;
                continue;   /* faster when present (confirmed on gcc-8) ... (?) */
    }   }   }

    goto _start;
}

#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov)                                                            \
    static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov(                                   \
            ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                          \
            void const* src, size_t srcSize)                                                             \
    {                                                                                                    \
        return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \
    }

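/* For illustration (not part of the build): ZSTD_GEN_FAST_FN(noDict, 4, 1) below expands to
 * roughly the following wrapper, binding mls=4 and useCmov=1 for the inlined template above:
 *
 *   static size_t ZSTD_compressBlock_fast_noDict_4_1(
 *           ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
 *           void const* src, size_t srcSize)
 *   {
 *       return ZSTD_compressBlock_fast_noDict_generic(ms, seqStore, rep, src, srcSize, 4, 1);
 *   }
 */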
ZSTD_GEN_FAST_FN(noDict, 4, 1)
ZSTD_GEN_FAST_FN(noDict, 5, 1)
ZSTD_GEN_FAST_FN(noDict, 6, 1)
ZSTD_GEN_FAST_FN(noDict, 7, 1)

ZSTD_GEN_FAST_FN(noDict, 4, 0)
ZSTD_GEN_FAST_FN(noDict, 5, 0)
ZSTD_GEN_FAST_FN(noDict, 6, 0)
ZSTD_GEN_FAST_FN(noDict, 7, 0)

size_t ZSTD_compressBlock_fast(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mml = ms->cParams.minMatch;
    /* use cmov when "candidate in range" branch is likely unpredictable */
    int const useCmov = ms->cParams.windowLog < 19;
    assert(ms->dictMatchState == NULL);
    if (useCmov) {
        switch(mml)
        {
        default: /* includes case 3 */
        case 4 :
            return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
        case 5 :
            return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
        case 6 :
            return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
        case 7 :
            return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
        }
    } else {
        /* use a branch instead */
        switch(mml)
        {
        default: /* includes case 3 */
        case 4 :
            return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
        case 5 :
            return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
        case 6 :
            return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
        case 7 :
            return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
        }
    }
}

FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_dictMatchState_generic(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    /* support stepSize of 0 */
    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip0 = istart;
    const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
    const BYTE* anchor = istart;
    const U32   prefixStartIndex = ms->window.dictLimit;
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;
    U32 offset_1=rep[0], offset_2=rep[1];

    const ZSTD_MatchState_t* const dms = ms->dictMatchState;
    const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
    const U32* const dictHashTable = dms->hashTable;
    const U32 dictStartIndex       = dms->window.dictLimit;
    const BYTE* const dictBase     = dms->window.base;
    const BYTE* const dictStart    = dictBase + dictStartIndex;
    const BYTE* const dictEnd      = dms->window.nextSrc;
    const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
    const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
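
    /* Note: dictionary hash table entries are "tagged": ZSTD_fillHashTableForCDict stores
     * (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag, where the tag is taken from the low bits of a
     * wider hash (dictHBits). Comparing tags via ZSTD_comparePackedTags below cheaply filters
     * out most false candidates before dictionary content is dereferenced. */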

    /* if a dictionary is still attached, it necessarily means that
     * it is within window size. So we just check it. */
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
    assert(endIndex - prefixStartIndex <= maxDistance);
    (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */

    (void)hasStep; /* not currently specialized on whether it's accelerated */

    /* ensure there will be no underflow
     * when translating a dict index into a local index */
    assert(prefixStartIndex >= (U32)(dictEnd - dictBase));

    if (ms->prefetchCDictTables) {
        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
        PREFETCH_AREA(dictHashTable, hashTableBytes);
    }

    /* init */
    DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
    ip0 += (dictAndPrefixLength == 0);
    /* dictMatchState repCode checks don't currently handle repCode == 0
     * disabling. */
    assert(offset_1 <= dictAndPrefixLength);
    assert(offset_2 <= dictAndPrefixLength);

    /* Outer search loop */
    assert(stepSize >= 1);
    while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
        size_t mLength;
        size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);

        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);

        U32 matchIndex = hashTable[hash0];
        U32 curr = (U32)(ip0 - base);
        size_t step = stepSize;
        const size_t kStepIncr = 1 << kSearchStrength;
        const BYTE* nextStep = ip0 + kStepIncr;

        /* Inner search loop */
        while (1) {
            const BYTE* match = base + matchIndex;
            const U32 repIndex = curr + 1 - offset_1;
            const BYTE* repMatch = (repIndex < prefixStartIndex) ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
            const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
            hashTable[hash0] = curr;   /* update hash table */

            if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex))
                && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
                const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
                mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
                ip0++;
                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
                break;
            }

            if (dictTagsMatch) {
                /* Found a possible dict match */
                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
                const BYTE* dictMatch = dictBase + dictMatchIndex;
                if (dictMatchIndex > dictStartIndex &&
                    MEM_read32(dictMatch) == MEM_read32(ip0)) {
                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
                    if (matchIndex <= prefixStartIndex) {
                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
                        while (((ip0 > anchor) & (dictMatch > dictStart))
                            && (ip0[-1] == dictMatch[-1])) {
                            ip0--;
                            dictMatch--;
                            mLength++;
                        } /* catch up */
                        offset_2 = offset_1;
                        offset_1 = offset;
                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
                        break;
                    }
                }
            }

            if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) {
                /* found a regular match of size >= 4 */
                U32 const offset = (U32) (ip0 - match);
                mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
                while (((ip0 > anchor) & (match > prefixStart))
                    && (ip0[-1] == match[-1])) {
                    ip0--;
                    match--;
                    mLength++;
                } /* catch up */
                offset_2 = offset_1;
                offset_1 = offset;
                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
                break;
            }

            /* Prepare for next iteration */
            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
            matchIndex = hashTable[hash1];

            if (ip1 >= nextStep) {
                step++;
                nextStep += kStepIncr;
            }
            ip0 = ip1;
            ip1 = ip1 + step;
            if (ip1 > ilimit) goto _cleanup;

            curr = (U32)(ip0 - base);
            hash0 = hash1;
        }   /* end inner search loop */

        /* match found */
        assert(mLength);
        ip0 += mLength;
        anchor = ip0;

        if (ip0 <= ilimit) {
            /* Fill Table */
            assert(base+curr+2 > istart);  /* check base overflow */
            hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

            /* check immediate repcode */
            while (ip0 <= ilimit) {
                U32 const current2 = (U32)(ip0-base);
                U32 const repIndex2 = current2 - offset_2;
                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
                        dictBase - dictIndexDelta + repIndex2 :
                        base + repIndex2;
                if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2))
                   && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                    size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
                    ip0 += repLength2;
                    anchor = ip0;
                    continue;
                }
                break;
            }
        }

        /* Prepare for next iteration */
        assert(ip0 == anchor);
        ip1 = ip0 + stepSize;
    }

_cleanup:
    /* save reps for next block */
    rep[0] = offset_1;
    rep[1] = offset_2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}


ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)

size_t ZSTD_compressBlock_fast_dictMatchState(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mls = ms->cParams.minMatch;
    assert(ms->dictMatchState != NULL);
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
        return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
    case 5 :
        return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
    case 6 :
        return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
    case 7 :
        return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
    }
}


static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_extDict_generic(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    /* support stepSize of 0 */
    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* anchor = istart;
    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
    const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
    const U32   dictStartIndex = lowLimit;
    const BYTE* const dictStart = dictBase + dictStartIndex;
    const U32   dictLimit = ms->window.dictLimit;
    const U32   prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const dictEnd = dictBase + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - 8;
    U32 offset_1=rep[0], offset_2=rep[1];
    U32 offsetSaved1 = 0, offsetSaved2 = 0;

    const BYTE* ip0 = istart;
    const BYTE* ip1;
    const BYTE* ip2;
    const BYTE* ip3;
    U32 current0;


    size_t hash0; /* hash for ip0 */
    size_t hash1; /* hash for ip1 */
    U32 idx; /* match idx for ip0 */
    const BYTE* idxBase; /* base pointer for idx */

    U32 offcode;
    const BYTE* match0;
    size_t mLength;
    const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */

    size_t step;
    const BYTE* nextStep;
    const size_t kStepIncr = (1 << (kSearchStrength - 1));

    (void)hasStep; /* not currently specialized on whether it's accelerated */

    DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);

    /* switch to "regular" variant if extDict is invalidated due to maxDistance */
    if (prefixStartIndex == dictStartIndex)
        return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);

    {   U32 const curr = (U32)(ip0 - base);
        U32 const maxRep = curr - dictStartIndex;
        if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
        if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
    }

    /* start each op */
_start: /* Requires: ip0 */

    step = stepSize;
    nextStep = ip0 + kStepIncr;

    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
    ip1 = ip0 + 1;
    ip2 = ip0 + step;
    ip3 = ip2 + 1;

    if (ip3 >= ilimit) {
        goto _cleanup;
    }

    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
    hash1 = ZSTD_hashPtr(ip1, hlog, mls);

    idx = hashTable[hash0];
    idxBase = idx < prefixStartIndex ? dictBase : base;
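    /* Note: idxBase tracks which segment idx points into: the dictionary (extDict) buffer when
     * idx < prefixStartIndex, and the current prefix otherwise; match loads below use idxBase + idx. */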

    do {
        {   /* load repcode match for ip[2] */
            U32 const current2 = (U32)(ip2 - base);
            U32 const repIndex = current2 - offset_1;
            const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
            U32 rval;
            if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
                 & (offset_1 > 0) ) {
                rval = MEM_read32(repBase + repIndex);
            } else {
                rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
            }

            /* write back hash table entry */
            current0 = (U32)(ip0 - base);
            hashTable[hash0] = current0;

            /* check repcode at ip[2] */
            if (MEM_read32(ip2) == rval) {
                ip0 = ip2;
                match0 = repBase + repIndex;
                matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
                assert((match0 != prefixStart) & (match0 != dictStart));
                mLength = ip0[-1] == match0[-1];
                ip0 -= mLength;
                match0 -= mLength;
                offcode = REPCODE1_TO_OFFBASE;
                mLength += 4;
                goto _match;
        }   }

        {   /* load match for ip[0] */
            U32 const mval = idx >= dictStartIndex ?
                    MEM_read32(idxBase + idx) :
                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */

            /* check match at ip[0] */
            if (MEM_read32(ip0) == mval) {
                /* found a match! */
                goto _offset;
        }   }

        /* lookup ip[1] */
        idx = hashTable[hash1];
        idxBase = idx < prefixStartIndex ? dictBase : base;

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip3;

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        {   /* load match for ip[0] */
            U32 const mval = idx >= dictStartIndex ?
                    MEM_read32(idxBase + idx) :
                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */

            /* check match at ip[0] */
            if (MEM_read32(ip0) == mval) {
                /* found a match! */
                goto _offset;
        }   }

        /* lookup ip[1] */
        idx = hashTable[hash1];
        idxBase = idx < prefixStartIndex ? dictBase : base;

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip0 + step;
        ip3 = ip1 + step;

        /* calculate step */
        if (ip2 >= nextStep) {
            step++;
            PREFETCH_L1(ip1 + 64);
            PREFETCH_L1(ip1 + 128);
            nextStep += kStepIncr;
        }
    } while (ip3 < ilimit);

_cleanup:
    /* Note that there are probably still a couple positions we could search.
     * However, it seems to be a meaningful performance hit to try to search
     * them. So let's not. */

    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;

    /* save reps for next block */
    rep[0] = offset_1 ? offset_1 : offsetSaved1;
    rep[1] = offset_2 ? offset_2 : offsetSaved2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);

_offset: /* Requires: ip0, idx, idxBase */

    /* Compute the offset code. */
    {   U32 const offset = current0 - idx;
        const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
        matchEnd = idx < prefixStartIndex ? dictEnd : iend;
        match0 = idxBase + idx;
        offset_2 = offset_1;
        offset_1 = offset;
        offcode = OFFSET_TO_OFFBASE(offset);
        mLength = 4;

        /* Count the backwards match length. */
        while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
            ip0--;
            match0--;
            mLength++;
    }   }

_match: /* Requires: ip0, match0, offcode, matchEnd */

    /* Count the forward length. */
    assert(matchEnd != 0);
    mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);

    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);

    ip0 += mLength;
    anchor = ip0;

    /* write next hash table entry */
    if (ip1 < ip0) {
        hashTable[hash1] = (U32)(ip1 - base);
    }

    /* Fill table and check for immediate repcode. */
    if (ip0 <= ilimit) {
        /* Fill Table */
        assert(base+current0+2 > istart);  /* check base overflow */
        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

        while (ip0 <= ilimit) {
            U32 const repIndex2 = (U32)(ip0-base) - offset_2;
            const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
            if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0))
                 && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
                const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
                ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                ip0 += repLength2;
                anchor = ip0;
                continue;
            }
            break;
    }   }

    goto _start;
}

ZSTD_GEN_FAST_FN(extDict, 4, 0)
ZSTD_GEN_FAST_FN(extDict, 5, 0)
ZSTD_GEN_FAST_FN(extDict, 6, 0)
ZSTD_GEN_FAST_FN(extDict, 7, 0)

size_t ZSTD_compressBlock_fast_extDict(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mls = ms->cParams.minMatch;
    assert(ms->dictMatchState == NULL);
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
        return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
    case 5 :
        return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
    case 6 :
        return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
    case 7 :
        return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
    }
}