xref: /linux/lib/zstd/compress/zstd_fast.c (revision d97e2634fbdcd238a51bc363267df0139c17f4da)
1 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
2 /*
3  * Copyright (c) Meta Platforms, Inc. and affiliates.
4  * All rights reserved.
5  *
6  * This source code is licensed under both the BSD-style license (found in the
7  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
8  * in the COPYING file in the root directory of this source tree).
9  * You may select, at your option, one of the above-listed licenses.
10  */
11 
12 #include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
13 #include "zstd_fast.h"
14 
15 static
16 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
17 void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms,
18                         const void* const end,
19                         ZSTD_dictTableLoadMethod_e dtlm)
20 {
21     const ZSTD_compressionParameters* const cParams = &ms->cParams;
22     U32* const hashTable = ms->hashTable;
23     U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
24     U32  const mls = cParams->minMatch;
25     const BYTE* const base = ms->window.base;
26     const BYTE* ip = base + ms->nextToUpdate;
27     const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
28     const U32 fastHashFillStep = 3;
29 
30     /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
31      * Feel free to remove this assert if there's a good reason! */
32     assert(dtlm == ZSTD_dtlm_full);
33 
34     /* Always insert every fastHashFillStep position into the hash table.
35      * Insert the other positions if their hash entry is empty.
36      */
37     for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
38         U32 const curr = (U32)(ip - base);
39         {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
40             ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }
41 
42         if (dtlm == ZSTD_dtlm_fast) continue;
43         /* Only load extra positions for ZSTD_dtlm_full */
44         {   U32 p;
45             for (p = 1; p < fastHashFillStep; ++p) {
46                 size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
47                 if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
48                     ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
49     }   }   }   }
50 }
51 
52 static
53 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
54 void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms,
55                         const void* const end,
56                         ZSTD_dictTableLoadMethod_e dtlm)
57 {
58     const ZSTD_compressionParameters* const cParams = &ms->cParams;
59     U32* const hashTable = ms->hashTable;
60     U32  const hBits = cParams->hashLog;
61     U32  const mls = cParams->minMatch;
62     const BYTE* const base = ms->window.base;
63     const BYTE* ip = base + ms->nextToUpdate;
64     const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
65     const U32 fastHashFillStep = 3;
66 
67     /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
68      * Feel free to remove this assert if there's a good reason! */
69     assert(dtlm == ZSTD_dtlm_fast);
70 
71     /* Always insert every fastHashFillStep position into the hash table.
72      * Insert the other positions if their hash entry is empty.
73      */
74     for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
75         U32 const curr = (U32)(ip - base);
76         size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
77         hashTable[hash0] = curr;
78         if (dtlm == ZSTD_dtlm_fast) continue;
79         /* Only load extra positions for ZSTD_dtlm_full */
80         {   U32 p;
81             for (p = 1; p < fastHashFillStep; ++p) {
82                 size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
83                 if (hashTable[hash] == 0) {  /* not yet filled */
84                     hashTable[hash] = curr + p;
85     }   }   }   }
86 }
87 
88 void ZSTD_fillHashTable(ZSTD_MatchState_t* ms,
89                         const void* const end,
90                         ZSTD_dictTableLoadMethod_e dtlm,
91                         ZSTD_tableFillPurpose_e tfp)
92 {
93     if (tfp == ZSTD_tfp_forCDict) {
94         ZSTD_fillHashTableForCDict(ms, end, dtlm);
95     } else {
96         ZSTD_fillHashTableForCCtx(ms, end, dtlm);
97     }
98 }
99 
100 
101 typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit);
102 
103 static int
104 ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
105 {
106     /* Array of ~random data, should have low probability of matching data.
107      * Load from here if the index is invalid.
108      * Used to avoid unpredictable branches. */
109     static const BYTE dummy[] = {0x12,0x34,0x56,0x78};
110 
111     /* currentIdx >= lowLimit is a (somewhat) unpredictable branch.
112      * However expression below compiles into conditional move.
113      */
114     const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy);
115     /* Note: this used to be written as : return test1 && test2;
116      * Unfortunately, once inlined, these tests become branches,
117      * in which case it becomes critical that they are executed in the right order (test1 then test2).
118      * So we have to write these tests in a specific manner to ensure their ordering.
119      */
120     if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0;
121     /* force ordering of these tests, which matters once the function is inlined, as they become branches */
122     __asm__("");
123     return matchIdx >= idxLowLimit;
124 }
125 
126 static int
127 ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
128 {
129     /* using a branch instead of a cmov,
130      * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true,
131      * aka almost all candidates are within range */
132     U32 mval;
133     if (matchIdx >= idxLowLimit) {
134         mval = MEM_read32(matchAddress);
135     } else {
136         mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */
137     }
138 
139     return (MEM_read32(currentPtr) == mval);
140 }
141 
142 
143 /*
144  * If you squint hard enough (and ignore repcodes), the search operation at any
145  * given position is broken into 4 stages:
146  *
147  * 1. Hash   (map position to hash value via input read)
148  * 2. Lookup (map hash val to index via hashtable read)
149  * 3. Load   (map index to value at that position via input read)
150  * 4. Compare
151  *
152  * Each of these steps involves a memory read at an address which is computed
153  * from the previous step. This means these steps must be sequenced and their
154  * latencies are cumulative.
155  *
156  * Rather than do 1->2->3->4 sequentially for a single position before moving
157  * onto the next, this implementation interleaves these operations across the
158  * next few positions:
159  *
160  * R = Repcode Read & Compare
161  * H = Hash
162  * T = Table Lookup
163  * M = Match Read & Compare
164  *
165  * Pos | Time -->
166  * ----+-------------------
167  * N   | ... M
168  * N+1 | ...   TM
169  * N+2 |    R H   T M
170  * N+3 |         H    TM
171  * N+4 |           R H   T M
172  * N+5 |                H   ...
173  * N+6 |                  R ...
174  *
175  * This is very much analogous to the pipelining of execution in a CPU. And just
176  * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
177  * branch).
178  *
179  * When this happens, we throw away our current state, and do the following prep
180  * to re-enter the loop:
181  *
182  * Pos | Time -->
183  * ----+-------------------
184  * N   | H T
185  * N+1 |  H
186  *
187  * This is also the work we do at the beginning to enter the loop initially.
188  */
189 FORCE_INLINE_TEMPLATE
190 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
191 size_t ZSTD_compressBlock_fast_noDict_generic(
192         ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
193         void const* src, size_t srcSize,
194         U32 const mls, int useCmov)
195 {
196     const ZSTD_compressionParameters* const cParams = &ms->cParams;
197     U32* const hashTable = ms->hashTable;
198     U32 const hlog = cParams->hashLog;
199     size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */
200     const BYTE* const base = ms->window.base;
201     const BYTE* const istart = (const BYTE*)src;
202     const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
203     const U32   prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
204     const BYTE* const prefixStart = base + prefixStartIndex;
205     const BYTE* const iend = istart + srcSize;
206     const BYTE* const ilimit = iend - HASH_READ_SIZE;
207 
208     const BYTE* anchor = istart;
209     const BYTE* ip0 = istart;
210     const BYTE* ip1;
211     const BYTE* ip2;
212     const BYTE* ip3;
213     U32 current0;
214 
215     U32 rep_offset1 = rep[0];
216     U32 rep_offset2 = rep[1];
217     U32 offsetSaved1 = 0, offsetSaved2 = 0;
218 
219     size_t hash0; /* hash for ip0 */
220     size_t hash1; /* hash for ip1 */
221     U32 matchIdx; /* match idx for ip0 */
222 
223     U32 offcode;
224     const BYTE* match0;
225     size_t mLength;
226 
227     /* ip0 and ip1 are always adjacent. The targetLength skipping and
228      * uncompressibility acceleration is applied to every other position,
229      * matching the behavior of #1562. step therefore represents the gap
230      * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
231     size_t step;
232     const BYTE* nextStep;
233     const size_t kStepIncr = (1 << (kSearchStrength - 1));
234     const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch;
235 
236     DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
237     ip0 += (ip0 == prefixStart);
238     {   U32 const curr = (U32)(ip0 - base);
239         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
240         U32 const maxRep = curr - windowLow;
241         if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
242         if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
243     }
244 
245     /* start each op */
246 _start: /* Requires: ip0 */
247 
248     step = stepSize;
249     nextStep = ip0 + kStepIncr;
250 
251     /* calculate positions, ip0 - anchor == 0, so we skip step calc */
252     ip1 = ip0 + 1;
253     ip2 = ip0 + step;
254     ip3 = ip2 + 1;
255 
256     if (ip3 >= ilimit) {
257         goto _cleanup;
258     }
259 
260     hash0 = ZSTD_hashPtr(ip0, hlog, mls);
261     hash1 = ZSTD_hashPtr(ip1, hlog, mls);
262 
263     matchIdx = hashTable[hash0];
264 
265     do {
266         /* load repcode match for ip[2]*/
267         const U32 rval = MEM_read32(ip2 - rep_offset1);
268 
269         /* write back hash table entry */
270         current0 = (U32)(ip0 - base);
271         hashTable[hash0] = current0;
272 
273         /* check repcode at ip[2] */
274         if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
275             ip0 = ip2;
276             match0 = ip0 - rep_offset1;
277             mLength = ip0[-1] == match0[-1];
278             ip0 -= mLength;
279             match0 -= mLength;
280             offcode = REPCODE1_TO_OFFBASE;
281             mLength += 4;
282 
283             /* Write next hash table entry: it's already calculated.
284              * This write is known to be safe because ip1 is before the
285              * repcode (ip2). */
286             hashTable[hash1] = (U32)(ip1 - base);
287 
288             goto _match;
289         }
290 
291          if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
292             /* Write next hash table entry (it's already calculated).
293             * This write is known to be safe because the ip1 == ip0 + 1,
294             * so searching will resume after ip1 */
295             hashTable[hash1] = (U32)(ip1 - base);
296 
297             goto _offset;
298         }
299 
300         /* lookup ip[1] */
301         matchIdx = hashTable[hash1];
302 
303         /* hash ip[2] */
304         hash0 = hash1;
305         hash1 = ZSTD_hashPtr(ip2, hlog, mls);
306 
307         /* advance to next positions */
308         ip0 = ip1;
309         ip1 = ip2;
310         ip2 = ip3;
311 
312         /* write back hash table entry */
313         current0 = (U32)(ip0 - base);
314         hashTable[hash0] = current0;
315 
316          if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
317             /* Write next hash table entry, since it's already calculated */
318             if (step <= 4) {
319                 /* Avoid writing an index if it's >= position where search will resume.
320                 * The minimum possible match has length 4, so search can resume at ip0 + 4.
321                 */
322                 hashTable[hash1] = (U32)(ip1 - base);
323             }
324             goto _offset;
325         }
326 
327         /* lookup ip[1] */
328         matchIdx = hashTable[hash1];
329 
330         /* hash ip[2] */
331         hash0 = hash1;
332         hash1 = ZSTD_hashPtr(ip2, hlog, mls);
333 
334         /* advance to next positions */
335         ip0 = ip1;
336         ip1 = ip2;
337         ip2 = ip0 + step;
338         ip3 = ip1 + step;
339 
340         /* calculate step */
341         if (ip2 >= nextStep) {
342             step++;
343             PREFETCH_L1(ip1 + 64);
344             PREFETCH_L1(ip1 + 128);
345             nextStep += kStepIncr;
346         }
347     } while (ip3 < ilimit);
348 
349 _cleanup:
350     /* Note that there are probably still a couple positions one could search.
351      * However, it seems to be a meaningful performance hit to try to search
352      * them. So let's not. */
353 
354     /* When the repcodes are outside of the prefix, we set them to zero before the loop.
355      * When the offsets are still zero, we need to restore them after the block to have a correct
356      * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
357      * offsets were invalid. We need to figure out which offset to refill with.
358      *     - If both offsets are zero they are in the same order.
359      *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
360      *     - If only one is zero, we need to decide which offset to restore.
361      *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
362      *         - It is impossible for rep_offset2 to be non-zero.
363      *
364      * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
365      * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
366      */
367     offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
368 
369     /* save reps for next block */
370     rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
371     rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
372 
373     /* Return the last literals size */
374     return (size_t)(iend - anchor);
375 
376 _offset: /* Requires: ip0, idx */
377 
378     /* Compute the offset code. */
379     match0 = base + matchIdx;
380     rep_offset2 = rep_offset1;
381     rep_offset1 = (U32)(ip0-match0);
382     offcode = OFFSET_TO_OFFBASE(rep_offset1);
383     mLength = 4;
384 
385     /* Count the backwards match length. */
386     while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {
387         ip0--;
388         match0--;
389         mLength++;
390     }
391 
392 _match: /* Requires: ip0, match0, offcode */
393 
394     /* Count the forward length. */
395     mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);
396 
397     ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
398 
399     ip0 += mLength;
400     anchor = ip0;
401 
402     /* Fill table and check for immediate repcode. */
403     if (ip0 <= ilimit) {
404         /* Fill Table */
405         assert(base+current0+2 > istart);  /* check base overflow */
406         hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
407         hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
408 
409         if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
410             while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
411                 /* store sequence */
412                 size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
413                 { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
414                 hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
415                 ip0 += rLength;
416                 ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
417                 anchor = ip0;
418                 continue;   /* faster when present (confirmed on gcc-8) ... (?) */
419     }   }   }
420 
421     goto _start;
422 }
423 
424 #define ZSTD_GEN_FAST_FN(dictMode, mml, cmov)                                                       \
425     static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov(                              \
426             ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                    \
427             void const* src, size_t srcSize)                                                       \
428     {                                                                                              \
429         return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \
430     }
431 
432 ZSTD_GEN_FAST_FN(noDict, 4, 1)
433 ZSTD_GEN_FAST_FN(noDict, 5, 1)
434 ZSTD_GEN_FAST_FN(noDict, 6, 1)
435 ZSTD_GEN_FAST_FN(noDict, 7, 1)
436 
437 ZSTD_GEN_FAST_FN(noDict, 4, 0)
438 ZSTD_GEN_FAST_FN(noDict, 5, 0)
439 ZSTD_GEN_FAST_FN(noDict, 6, 0)
440 ZSTD_GEN_FAST_FN(noDict, 7, 0)
441 
442 size_t ZSTD_compressBlock_fast(
443         ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
444         void const* src, size_t srcSize)
445 {
446     U32 const mml = ms->cParams.minMatch;
447     /* use cmov when "candidate in range" branch is likely unpredictable */
448     int const useCmov = ms->cParams.windowLog < 19;
449     assert(ms->dictMatchState == NULL);
450     if (useCmov) {
451         switch(mml)
452         {
453         default: /* includes case 3 */
454         case 4 :
455             return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
456         case 5 :
457             return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
458         case 6 :
459             return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
460         case 7 :
461             return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
462         }
463     } else {
464         /* use a branch instead */
465         switch(mml)
466         {
467         default: /* includes case 3 */
468         case 4 :
469             return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
470         case 5 :
471             return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
472         case 6 :
473             return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
474         case 7 :
475             return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
476         }
477     }
478 }
479 
480 FORCE_INLINE_TEMPLATE
481 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
482 size_t ZSTD_compressBlock_fast_dictMatchState_generic(
483         ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
484         void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
485 {
486     const ZSTD_compressionParameters* const cParams = &ms->cParams;
487     U32* const hashTable = ms->hashTable;
488     U32 const hlog = cParams->hashLog;
489     /* support stepSize of 0 */
490     U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
491     const BYTE* const base = ms->window.base;
492     const BYTE* const istart = (const BYTE*)src;
493     const BYTE* ip0 = istart;
494     const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
495     const BYTE* anchor = istart;
496     const U32   prefixStartIndex = ms->window.dictLimit;
497     const BYTE* const prefixStart = base + prefixStartIndex;
498     const BYTE* const iend = istart + srcSize;
499     const BYTE* const ilimit = iend - HASH_READ_SIZE;
500     U32 offset_1=rep[0], offset_2=rep[1];
501 
502     const ZSTD_MatchState_t* const dms = ms->dictMatchState;
503     const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
504     const U32* const dictHashTable = dms->hashTable;
505     const U32 dictStartIndex       = dms->window.dictLimit;
506     const BYTE* const dictBase     = dms->window.base;
507     const BYTE* const dictStart    = dictBase + dictStartIndex;
508     const BYTE* const dictEnd      = dms->window.nextSrc;
509     const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
510     const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
511     const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
512 
513     /* if a dictionary is still attached, it necessarily means that
514      * it is within window size. So we just check it. */
515     const U32 maxDistance = 1U << cParams->windowLog;
516     const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
517     assert(endIndex - prefixStartIndex <= maxDistance);
518     (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */
519 
520     (void)hasStep; /* not currently specialized on whether it's accelerated */
521 
522     /* ensure there will be no underflow
523      * when translating a dict index into a local index */
524     assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
525 
526     if (ms->prefetchCDictTables) {
527         size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
528         PREFETCH_AREA(dictHashTable, hashTableBytes);
529     }
530 
531     /* init */
532     DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
533     ip0 += (dictAndPrefixLength == 0);
534     /* dictMatchState repCode checks don't currently handle repCode == 0
535      * disabling. */
536     assert(offset_1 <= dictAndPrefixLength);
537     assert(offset_2 <= dictAndPrefixLength);
538 
539     /* Outer search loop */
540     assert(stepSize >= 1);
541     while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
542         size_t mLength;
543         size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
544 
545         size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
546         U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
547         int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
548 
549         U32 matchIndex = hashTable[hash0];
550         U32 curr = (U32)(ip0 - base);
551         size_t step = stepSize;
552         const size_t kStepIncr = 1 << kSearchStrength;
553         const BYTE* nextStep = ip0 + kStepIncr;
554 
555         /* Inner search loop */
556         while (1) {
557             const BYTE* match = base + matchIndex;
558             const U32 repIndex = curr + 1 - offset_1;
559             const BYTE* repMatch = (repIndex < prefixStartIndex) ?
560                                    dictBase + (repIndex - dictIndexDelta) :
561                                    base + repIndex;
562             const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
563             size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
564             hashTable[hash0] = curr;   /* update hash table */
565 
566             if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex))
567                 && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
568                 const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
569                 mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
570                 ip0++;
571                 ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
572                 break;
573             }
574 
575             if (dictTagsMatch) {
576                 /* Found a possible dict match */
577                 const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
578                 const BYTE* dictMatch = dictBase + dictMatchIndex;
579                 if (dictMatchIndex > dictStartIndex &&
580                     MEM_read32(dictMatch) == MEM_read32(ip0)) {
581                     /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
582                     if (matchIndex <= prefixStartIndex) {
583                         U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
584                         mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
585                         while (((ip0 > anchor) & (dictMatch > dictStart))
586                             && (ip0[-1] == dictMatch[-1])) {
587                             ip0--;
588                             dictMatch--;
589                             mLength++;
590                         } /* catch up */
591                         offset_2 = offset_1;
592                         offset_1 = offset;
593                         ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
594                         break;
595                     }
596                 }
597             }
598 
599             if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) {
600                 /* found a regular match of size >= 4 */
601                 U32 const offset = (U32) (ip0 - match);
602                 mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
603                 while (((ip0 > anchor) & (match > prefixStart))
604                        && (ip0[-1] == match[-1])) {
605                     ip0--;
606                     match--;
607                     mLength++;
608                 } /* catch up */
609                 offset_2 = offset_1;
610                 offset_1 = offset;
611                 ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
612                 break;
613             }
614 
615             /* Prepare for next iteration */
616             dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
617             dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
618             matchIndex = hashTable[hash1];
619 
620             if (ip1 >= nextStep) {
621                 step++;
622                 nextStep += kStepIncr;
623             }
624             ip0 = ip1;
625             ip1 = ip1 + step;
626             if (ip1 > ilimit) goto _cleanup;
627 
628             curr = (U32)(ip0 - base);
629             hash0 = hash1;
630         }   /* end inner search loop */
631 
632         /* match found */
633         assert(mLength);
634         ip0 += mLength;
635         anchor = ip0;
636 
637         if (ip0 <= ilimit) {
638             /* Fill Table */
639             assert(base+curr+2 > istart);  /* check base overflow */
640             hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
641             hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
642 
643             /* check immediate repcode */
644             while (ip0 <= ilimit) {
645                 U32 const current2 = (U32)(ip0-base);
646                 U32 const repIndex2 = current2 - offset_2;
647                 const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
648                         dictBase - dictIndexDelta + repIndex2 :
649                         base + repIndex2;
650                 if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2))
651                    && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
652                     const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
653                     size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
654                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
655                     ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
656                     hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
657                     ip0 += repLength2;
658                     anchor = ip0;
659                     continue;
660                 }
661                 break;
662             }
663         }
664 
665         /* Prepare for next iteration */
666         assert(ip0 == anchor);
667         ip1 = ip0 + stepSize;
668     }
669 
670 _cleanup:
671     /* save reps for next block */
672     rep[0] = offset_1;
673     rep[1] = offset_2;
674 
675     /* Return the last literals size */
676     return (size_t)(iend - anchor);
677 }
678 
679 
680 ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
681 ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
682 ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
683 ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)
684 
685 size_t ZSTD_compressBlock_fast_dictMatchState(
686         ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
687         void const* src, size_t srcSize)
688 {
689     U32 const mls = ms->cParams.minMatch;
690     assert(ms->dictMatchState != NULL);
691     switch(mls)
692     {
693     default: /* includes case 3 */
694     case 4 :
695         return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
696     case 5 :
697         return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
698     case 6 :
699         return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
700     case 7 :
701         return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
702     }
703 }
704 
705 
706 static
707 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
708 size_t ZSTD_compressBlock_fast_extDict_generic(
709         ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
710         void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
711 {
712     const ZSTD_compressionParameters* const cParams = &ms->cParams;
713     U32* const hashTable = ms->hashTable;
714     U32 const hlog = cParams->hashLog;
715     /* support stepSize of 0 */
716     size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
717     const BYTE* const base = ms->window.base;
718     const BYTE* const dictBase = ms->window.dictBase;
719     const BYTE* const istart = (const BYTE*)src;
720     const BYTE* anchor = istart;
721     const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
722     const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
723     const U32   dictStartIndex = lowLimit;
724     const BYTE* const dictStart = dictBase + dictStartIndex;
725     const U32   dictLimit = ms->window.dictLimit;
726     const U32   prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
727     const BYTE* const prefixStart = base + prefixStartIndex;
728     const BYTE* const dictEnd = dictBase + prefixStartIndex;
729     const BYTE* const iend = istart + srcSize;
730     const BYTE* const ilimit = iend - 8;
731     U32 offset_1=rep[0], offset_2=rep[1];
732     U32 offsetSaved1 = 0, offsetSaved2 = 0;
733 
734     const BYTE* ip0 = istart;
735     const BYTE* ip1;
736     const BYTE* ip2;
737     const BYTE* ip3;
738     U32 current0;
739 
740 
741     size_t hash0; /* hash for ip0 */
742     size_t hash1; /* hash for ip1 */
743     U32 idx; /* match idx for ip0 */
744     const BYTE* idxBase; /* base pointer for idx */
745 
746     U32 offcode;
747     const BYTE* match0;
748     size_t mLength;
749     const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
750 
751     size_t step;
752     const BYTE* nextStep;
753     const size_t kStepIncr = (1 << (kSearchStrength - 1));
754 
755     (void)hasStep; /* not currently specialized on whether it's accelerated */
756 
757     DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
758 
759     /* switch to "regular" variant if extDict is invalidated due to maxDistance */
760     if (prefixStartIndex == dictStartIndex)
761         return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
762 
763     {   U32 const curr = (U32)(ip0 - base);
764         U32 const maxRep = curr - dictStartIndex;
765         if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
766         if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
767     }
768 
769     /* start each op */
770 _start: /* Requires: ip0 */
771 
772     step = stepSize;
773     nextStep = ip0 + kStepIncr;
774 
775     /* calculate positions, ip0 - anchor == 0, so we skip step calc */
776     ip1 = ip0 + 1;
777     ip2 = ip0 + step;
778     ip3 = ip2 + 1;
779 
780     if (ip3 >= ilimit) {
781         goto _cleanup;
782     }
783 
784     hash0 = ZSTD_hashPtr(ip0, hlog, mls);
785     hash1 = ZSTD_hashPtr(ip1, hlog, mls);
786 
787     idx = hashTable[hash0];
788     idxBase = idx < prefixStartIndex ? dictBase : base;
789 
790     do {
791         {   /* load repcode match for ip[2] */
792             U32 const current2 = (U32)(ip2 - base);
793             U32 const repIndex = current2 - offset_1;
794             const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
795             U32 rval;
796             if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
797                  & (offset_1 > 0) ) {
798                 rval = MEM_read32(repBase + repIndex);
799             } else {
800                 rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
801             }
802 
803             /* write back hash table entry */
804             current0 = (U32)(ip0 - base);
805             hashTable[hash0] = current0;
806 
807             /* check repcode at ip[2] */
808             if (MEM_read32(ip2) == rval) {
809                 ip0 = ip2;
810                 match0 = repBase + repIndex;
811                 matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
812                 assert((match0 != prefixStart) & (match0 != dictStart));
813                 mLength = ip0[-1] == match0[-1];
814                 ip0 -= mLength;
815                 match0 -= mLength;
816                 offcode = REPCODE1_TO_OFFBASE;
817                 mLength += 4;
818                 goto _match;
819         }   }
820 
821         {   /* load match for ip[0] */
822             U32 const mval = idx >= dictStartIndex ?
823                     MEM_read32(idxBase + idx) :
824                     MEM_read32(ip0) ^ 1; /* guaranteed not to match */
825 
826             /* check match at ip[0] */
827             if (MEM_read32(ip0) == mval) {
828                 /* found a match! */
829                 goto _offset;
830         }   }
831 
832         /* lookup ip[1] */
833         idx = hashTable[hash1];
834         idxBase = idx < prefixStartIndex ? dictBase : base;
835 
836         /* hash ip[2] */
837         hash0 = hash1;
838         hash1 = ZSTD_hashPtr(ip2, hlog, mls);
839 
840         /* advance to next positions */
841         ip0 = ip1;
842         ip1 = ip2;
843         ip2 = ip3;
844 
845         /* write back hash table entry */
846         current0 = (U32)(ip0 - base);
847         hashTable[hash0] = current0;
848 
849         {   /* load match for ip[0] */
850             U32 const mval = idx >= dictStartIndex ?
851                     MEM_read32(idxBase + idx) :
852                     MEM_read32(ip0) ^ 1; /* guaranteed not to match */
853 
854             /* check match at ip[0] */
855             if (MEM_read32(ip0) == mval) {
856                 /* found a match! */
857                 goto _offset;
858         }   }
859 
860         /* lookup ip[1] */
861         idx = hashTable[hash1];
862         idxBase = idx < prefixStartIndex ? dictBase : base;
863 
864         /* hash ip[2] */
865         hash0 = hash1;
866         hash1 = ZSTD_hashPtr(ip2, hlog, mls);
867 
868         /* advance to next positions */
869         ip0 = ip1;
870         ip1 = ip2;
871         ip2 = ip0 + step;
872         ip3 = ip1 + step;
873 
874         /* calculate step */
875         if (ip2 >= nextStep) {
876             step++;
877             PREFETCH_L1(ip1 + 64);
878             PREFETCH_L1(ip1 + 128);
879             nextStep += kStepIncr;
880         }
881     } while (ip3 < ilimit);
882 
883 _cleanup:
884     /* Note that there are probably still a couple positions we could search.
885      * However, it seems to be a meaningful performance hit to try to search
886      * them. So let's not. */
887 
888     /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
889      * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
890     offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
891 
892     /* save reps for next block */
893     rep[0] = offset_1 ? offset_1 : offsetSaved1;
894     rep[1] = offset_2 ? offset_2 : offsetSaved2;
895 
896     /* Return the last literals size */
897     return (size_t)(iend - anchor);
898 
899 _offset: /* Requires: ip0, idx, idxBase */
900 
901     /* Compute the offset code. */
902     {   U32 const offset = current0 - idx;
903         const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
904         matchEnd = idx < prefixStartIndex ? dictEnd : iend;
905         match0 = idxBase + idx;
906         offset_2 = offset_1;
907         offset_1 = offset;
908         offcode = OFFSET_TO_OFFBASE(offset);
909         mLength = 4;
910 
911         /* Count the backwards match length. */
912         while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
913             ip0--;
914             match0--;
915             mLength++;
916     }   }
917 
918 _match: /* Requires: ip0, match0, offcode, matchEnd */
919 
920     /* Count the forward length. */
921     assert(matchEnd != 0);
922     mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
923 
924     ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
925 
926     ip0 += mLength;
927     anchor = ip0;
928 
929     /* write next hash table entry */
930     if (ip1 < ip0) {
931         hashTable[hash1] = (U32)(ip1 - base);
932     }
933 
934     /* Fill table and check for immediate repcode. */
935     if (ip0 <= ilimit) {
936         /* Fill Table */
937         assert(base+current0+2 > istart);  /* check base overflow */
938         hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
939         hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
940 
941         while (ip0 <= ilimit) {
942             U32 const repIndex2 = (U32)(ip0-base) - offset_2;
943             const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
944             if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0))
945                  && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
946                 const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
947                 size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
948                 { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
949                 ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
950                 hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
951                 ip0 += repLength2;
952                 anchor = ip0;
953                 continue;
954             }
955             break;
956     }   }
957 
958     goto _start;
959 }
960 
961 ZSTD_GEN_FAST_FN(extDict, 4, 0)
962 ZSTD_GEN_FAST_FN(extDict, 5, 0)
963 ZSTD_GEN_FAST_FN(extDict, 6, 0)
964 ZSTD_GEN_FAST_FN(extDict, 7, 0)
965 
966 size_t ZSTD_compressBlock_fast_extDict(
967         ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
968         void const* src, size_t srcSize)
969 {
970     U32 const mls = ms->cParams.minMatch;
971     assert(ms->dictMatchState == NULL);
972     switch(mls)
973     {
974     default: /* includes case 3 */
975     case 4 :
976         return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
977     case 5 :
978         return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
979     case 6 :
980         return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
981     case 7 :
982         return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
983     }
984 }
985