// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* zstd_decompress_block :
 * this module takes care of decompressing _compressed_ block */

/*-*******************************************************
*  Dependencies
*********************************************************/
#include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
#include "../common/compiler.h"    /* prefetch */
#include "../common/cpu.h"         /* bmi2 */
#include "../common/mem.h"         /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "../common/fse.h"
#include "../common/huf.h"
#include "../common/zstd_internal.h"
#include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
#include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
#include "zstd_decompress_block.h"
#include "../common/bits.h"  /* ZSTD_highbit32 */

/*_*******************************************************
*  Macros
**********************************************************/

/* These two optional macros force the use one way or another of the two
 * ZSTD_decompressSequences implementations. You can't force in both directions
 * at the same time.
 */
#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
#endif


/*_*******************************************************
*  Memory operations
**********************************************************/
static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }


/*-*************************************************************
 *   Block decoding
 ***************************************************************/

static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx)
{
    size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX;
    assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX);
    return blockSizeMax;
}
/*! ZSTD_getcBlockSize() :
 *  Provides the size of compressed block from block header `src` */
size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                          blockProperties_t* bpPtr)
{
    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");

    {   U32 const cBlockHeader = MEM_readLE24(src);
        U32 const cSize = cBlockHeader >> 3;
        bpPtr->lastBlock = cBlockHeader & 1;
        bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
        bpPtr->origSize = cSize;   /* only useful for RLE */
        if (bpPtr->blockType == bt_rle) return 1;
        RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
        return cSize;
    }
}

/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
    const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
{
    size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
    assert(litSize <= blockSizeMax);
    assert(dctx->isFrameDecompression || streaming == not_streaming);
    assert(expectedWriteSize <= blockSizeMax);
    if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) {
        /* If we aren't streaming, we can just put the literals after the output
         * of the current block. We don't need to worry about overwriting the
         * extDict of our window, because it doesn't exist.
         * So if we have space after the end of the block, just put it there.
         */
        dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
        dctx->litBufferEnd = dctx->litBuffer + litSize;
        dctx->litBufferLocation = ZSTD_in_dst;
    } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) {
        /* Literals fit entirely within the extra buffer, put them there to avoid
         * having to split the literals.
         */
        dctx->litBuffer = dctx->litExtraBuffer;
        dctx->litBufferEnd = dctx->litBuffer + litSize;
        dctx->litBufferLocation = ZSTD_not_in_dst;
    } else {
        assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE);
        /* Literals must be split between the output block and the extra lit
         * buffer. We fill the extra lit buffer with the tail of the literals,
         * and put the rest of the literals at the end of the block, with
         * WILDCOPY_OVERLENGTH of buffer room to allow for overreads.
         * This MUST not write more than our maxBlockSize beyond dst, because in
         * streaming mode, that could overwrite part of our extDict window.
         */
        if (splitImmediately) {
            /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
            dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
        } else {
            /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
            dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
        }
        dctx->litBufferLocation = ZSTD_split;
        assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
    }
}
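/* Illustrative sketch (not part of the library) : how the 3-byte block header
 * read by ZSTD_getcBlockSize() above breaks down. The byte values are an
 * assumed example, not taken from a real frame.
 *
 *   header bytes { 0x25, 0x00, 0x00 }  =>  MEM_readLE24() == 0x000025 == 37
 *     lastBlock = 37 & 1        = 1    (this is the last block of the frame)
 *     blockType = (37 >> 1) & 3 = 2    (bt_compressed)
 *     cSize     = 37 >> 3       = 4    (4 bytes of compressed payload follow)
 *   For a bt_rle block the function returns 1 instead, since the payload is a
 *   single byte to be repeated origSize times.
 */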
/*! ZSTD_decodeLiteralsBlock() :
 * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
 * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
 * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
 * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
 *
 * @return : nb of bytes read from src (< srcSize )
 *  note : symbol not declared but exposed for fullbench */
static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
                          void* dst, size_t dstCapacity, const streaming_operation streaming)
{
    DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");

    {   const BYTE* const istart = (const BYTE*) src;
        SymbolEncodingType_e const litEncType = (SymbolEncodingType_e)(istart[0] & 3);
        size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);

        switch(litEncType)
        {
        case set_repeat:
            DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
            RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
            ZSTD_FALLTHROUGH;

        case set_compressed:
            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
            {   size_t lhSize, litSize, litCSize;
                U32 singleStream=0;
                U32 const lhlCode = (istart[0] >> 2) & 3;
                U32 const lhc = MEM_readLE32(istart);
                size_t hufSuccess;
                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                int const flags = 0
                    | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
                    | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
                switch(lhlCode)
                {
                case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
                    /* 2 - 2 - 10 - 10 */
                    singleStream = !lhlCode;
                    lhSize = 3;
                    litSize  = (lhc >> 4) & 0x3FF;
                    litCSize = (lhc >> 14) & 0x3FF;
                    break;
                case 2:
                    /* 2 - 2 - 14 - 14 */
                    lhSize = 4;
                    litSize  = (lhc >> 4) & 0x3FFF;
                    litCSize = lhc >> 18;
                    break;
                case 3:
                    /* 2 - 2 - 18 - 18 */
                    lhSize = 5;
                    litSize  = (lhc >> 4) & 0x3FFFF;
                    litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
                    break;
                }
                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                if (!singleStream)
                    RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
                        "Not enough literals (%zu) for the 4-streams mode (min %u)",
                        litSize, MIN_LITERALS_FOR_4_STREAMS);
                RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);

                /* prefetch huffman table if cold */
                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
                }

                if (litEncType==set_repeat) {
                    if (singleStream) {
                        hufSuccess = HUF_decompress1X_usingDTable(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, flags);
                    } else {
                        assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
                        hufSuccess = HUF_decompress4X_usingDTable(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, flags);
                    }
                } else {
                    if (singleStream) {
#if defined(HUF_FORCE_DECOMPRESS_X2)
                        hufSuccess = HUF_decompress1X_DCtx_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), flags);
#else
                        hufSuccess = HUF_decompress1X1_DCtx_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), flags);
#endif
                    } else {
                        hufSuccess = HUF_decompress4X_hufOnly_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), flags);
                    }
                }
                if (dctx->litBufferLocation == ZSTD_split)
                {
                    assert(litSize > ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
                    dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
                    dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
                    assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax);
                }

                RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                dctx->litEntropy = 1;
                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
                return litCSize + lhSize;
            }

        case set_basic:
            {   size_t litSize, lhSize;
                U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
                    litSize = MEM_readLE24(istart) >> 4;
                    break;
                }

                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
                    RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
                    if (dctx->litBufferLocation == ZSTD_split)
                    {
                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
                        ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                    }
                    else
                    {
                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
                    }
                    dctx->litPtr = dctx->litBuffer;
                    dctx->litSize = litSize;
                    return lhSize+litSize;
                }
                /* direct reference into compressed stream */
                dctx->litPtr = istart+lhSize;
                dctx->litSize = litSize;
                dctx->litBufferEnd = dctx->litPtr + litSize;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                return lhSize+litSize;
            }

        case set_rle:
            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t litSize, lhSize;
                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
                    litSize = MEM_readLE24(istart) >> 4;
                    break;
                }
                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                if (dctx->litBufferLocation == ZSTD_split)
                {
                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
                }
                else
                {
                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
                }
                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                return lhSize+1;
            }
        default:
            RETURN_ERROR(corruption_detected, "impossible");
        }
    }
}

/* Hidden declaration for fullbench */
size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,
                          void* dst, size_t dstCapacity);
size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,
                          void* dst, size_t dstCapacity)
{
    dctx->isFrameDecompression = 0;
    return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming);
}
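/* Illustrative sketch (not part of the library) : how a Raw_Literals_Block
 * header is parsed by the set_basic path of ZSTD_decodeLiteralsBlock() above.
 * The byte value is an assumed example only.
 *
 *   istart[0] = 0xA4 = 0b10100100
 *     litEncType = 0xA4 & 3        = 0   (set_basic)
 *     lhlCode    = (0xA4 >> 2) & 3 = 1   => lhSize = 2, litSize = MEM_readLE16(istart) >> 4
 *   lhlCode selects how many bits encode the regenerated size (5, 12 or 20),
 *   and therefore how many header bytes (1, 2 or 3) precede the literals.
 */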
/* Default FSE distribution tables.
 * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
 * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
 * They were generated programmatically with the following method :
 * - start from default distributions, present in /lib/common/zstd_internal.h
 * - generate tables normally, using ZSTD_buildFSETable()
 * - printout the content of tables
 * - prettify output, report below, test with fuzzer to ensure it's correct */

/* Default FSE distribution table for Literal Lengths */
static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
     {  1,  1,  1, LL_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
     /* nextState, nbAddBits, nbBits, baseVal */
     {  0,  0,  4,    0},  { 16,  0,  4,    0},
     { 32,  0,  5,    1},  {  0,  0,  5,    3},
     {  0,  0,  5,    4},  {  0,  0,  5,    6},
     {  0,  0,  5,    7},  {  0,  0,  5,    9},
     {  0,  0,  5,   10},  {  0,  0,  5,   12},
     {  0,  0,  6,   14},  {  0,  1,  5,   16},
     {  0,  1,  5,   20},  {  0,  1,  5,   22},
     {  0,  2,  5,   28},  {  0,  3,  5,   32},
     {  0,  4,  5,   48},  { 32,  6,  5,   64},
     {  0,  7,  5,  128},  {  0,  8,  6,  256},
     {  0, 10,  6, 1024},  {  0, 12,  6, 4096},
     { 32,  0,  4,    0},  {  0,  0,  4,    1},
     {  0,  0,  5,    2},  { 32,  0,  5,    4},
     {  0,  0,  5,    5},  { 32,  0,  5,    7},
     {  0,  0,  5,    8},  { 32,  0,  5,   10},
     {  0,  0,  5,   11},  {  0,  0,  6,   13},
     { 32,  1,  5,   16},  {  0,  1,  5,   18},
     { 32,  1,  5,   22},  {  0,  2,  5,   24},
     { 32,  3,  5,   32},  {  0,  3,  5,   40},
     {  0,  6,  4,   64},  { 16,  6,  4,   64},
     { 32,  7,  5,  128},  {  0,  9,  6,  512},
     {  0, 11,  6, 2048},  { 48,  0,  4,    0},
     { 16,  0,  4,    1},  { 32,  0,  5,    2},
     { 32,  0,  5,    3},  { 32,  0,  5,    5},
     { 32,  0,  5,    6},  { 32,  0,  5,    8},
     { 32,  0,  5,    9},  { 32,  0,  5,   11},
     { 32,  0,  5,   12},  {  0,  0,  6,   15},
     { 32,  1,  5,   18},  { 32,  1,  5,   20},
     { 32,  2,  5,   24},  { 32,  2,  5,   28},
     { 32,  3,  5,   40},  { 32,  4,  5,   48},
     {  0, 16,  6,65536},  {  0, 15,  6,32768},
     {  0, 14,  6,16384},  {  0, 13,  6, 8192},
};   /* LL_defaultDTable */

/* Default FSE distribution table for Offset Codes */
static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, OF_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {  0,  0,  5,    0},     {  0,  6,  4,   61},
    {  0,  9,  5,  509},     {  0, 15,  5,32765},
    {  0, 21,  5,2097149},   {  0,  3,  5,    5},
    {  0,  7,  4,  125},     {  0, 12,  5, 4093},
    {  0, 18,  5,262141},    {  0, 23,  5,8388605},
    {  0,  5,  5,   29},     {  0,  8,  4,  253},
    {  0, 14,  5,16381},     {  0, 20,  5,1048573},
    {  0,  2,  5,    1},     { 16,  7,  4,  125},
    {  0, 11,  5, 2045},     {  0, 17,  5,131069},
    {  0, 22,  5,4194301},   {  0,  4,  5,   13},
    { 16,  8,  4,  253},     {  0, 13,  5, 8189},
    {  0, 19,  5,524285},    {  0,  1,  5,    1},
    { 16,  6,  4,   61},     {  0, 10,  5, 1021},
    {  0, 16,  5,65533},     {  0, 28,  5,268435453},
    {  0, 27,  5,134217725}, {  0, 26,  5,67108861},
    {  0, 25,  5,33554429},  {  0, 24,  5,16777213},
};   /* OF_defaultDTable */


/* Default FSE distribution table for Match Lengths */
static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, ML_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {  0,  0,  6,    3},  {  0,  0,  4,    4},
    { 32,  0,  5,    5},  {  0,  0,  5,    6},
    {  0,  0,  5,    8},  {  0,  0,  5,    9},
    {  0,  0,  5,   11},  {  0,  0,  6,   13},
    {  0,  0,  6,   16},  {  0,  0,  6,   19},
    {  0,  0,  6,   22},  {  0,  0,  6,   25},
    {  0,  0,  6,   28},  {  0,  0,  6,   31},
    {  0,  0,  6,   34},  {  0,  1,  6,   37},
    {  0,  1,  6,   41},  {  0,  2,  6,   47},
    {  0,  3,  6,   59},  {  0,  4,  6,   83},
    {  0,  7,  6,  131},  {  0,  9,  6,  515},
    { 16,  0,  4,    4},  {  0,  0,  4,    5},
    { 32,  0,  5,    6},  {  0,  0,  5,    7},
    { 32,  0,  5,    9},  {  0,  0,  5,   10},
    {  0,  0,  6,   12},  {  0,  0,  6,   15},
    {  0,  0,  6,   18},  {  0,  0,  6,   21},
    {  0,  0,  6,   24},  {  0,  0,  6,   27},
    {  0,  0,  6,   30},  {  0,  0,  6,   33},
    {  0,  1,  6,   35},  {  0,  1,  6,   39},
    {  0,  2,  6,   43},  {  0,  3,  6,   51},
    {  0,  4,  6,   67},  {  0,  5,  6,   99},
    {  0,  8,  6,  259},  { 32,  0,  4,    4},
    { 48,  0,  4,    4},  { 16,  0,  4,    5},
    { 32,  0,  5,    7},  { 32,  0,  5,    8},
    { 32,  0,  5,   10},  { 32,  0,  5,   11},
    {  0,  0,  6,   14},  {  0,  0,  6,   17},
    {  0,  0,  6,   20},  {  0,  0,  6,   23},
    {  0,  0,  6,   26},  {  0,  0,  6,   29},
    {  0,  0,  6,   32},  {  0, 16,  6,65539},
    {  0, 15,  6,32771},  {  0, 14,  6,16387},
    {  0, 13,  6, 8195},  {  0, 12,  6, 4099},
    {  0, 11,  6, 2051},  {  0, 10,  6, 1027},
};   /* ML_defaultDTable */


static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
{
    void* ptr = dt;
    ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
    ZSTD_seqSymbol* const cell = dt + 1;

    DTableH->tableLog = 0;
    DTableH->fastMode = 0;

    cell->nbBits = 0;
    cell->nextState = 0;
    assert(nbAddBits < 255);
    cell->nbAdditionalBits = nbAddBits;
    cell->baseValue = baseValue;
}


/* ZSTD_buildFSETable() :
 * generate FSE decoding table for one symbol (ll, ml or off)
 * cannot fail if input is valid =>
 * all inputs are presumed validated at this stage */
FORCE_INLINE_TEMPLATE
void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_seqSymbol* const tableDecode = dt+1;
    U32 const maxSV1 = maxSymbolValue + 1;
    U32 const tableSize = 1 << tableLog;

    U16* symbolNext = (U16*)wksp;
    BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
    U32 highThreshold = tableSize - 1;


    /* Sanity Checks */
    assert(maxSymbolValue <= MaxSeq);
    assert(tableLog <= MaxFSELog);
    assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
    (void)wkspSize;
    /* Init, lay down lowprob symbols */
    {   ZSTD_seqSymbol_header DTableH;
        DTableH.tableLog = tableLog;
        DTableH.fastMode = 1;
        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
            U32 s;
            for (s=0; s<maxSV1; s++) {
                if (normalizedCounter[s]==-1) {
                    tableDecode[highThreshold--].baseValue = s;
                    symbolNext[s] = 1;
                } else {
                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
                    assert(normalizedCounter[s]>=0);
                    symbolNext[s] = (U16)normalizedCounter[s];
        }   }   }
        ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
    }

    /* Spread symbols */
    assert(tableSize <= 512);
    /* Specialized symbol spreading for the case when there are
     * no low probability (-1 count) symbols. When compressing
     * small blocks we avoid low probability symbols to hit this
     * case, since header decoding speed matters more.
     */
    if (highThreshold == tableSize - 1) {
        size_t const tableMask = tableSize-1;
        size_t const step = FSE_TABLESTEP(tableSize);
        /* First lay down the symbols in order.
         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
         * misses since small blocks generally have small table logs, so nearly
         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
         * our buffer to handle the over-write.
         */
        {
            U64 const add = 0x0101010101010101ull;
            size_t pos = 0;
            U64 sv = 0;
            U32 s;
            for (s=0; s<maxSV1; ++s, sv += add) {
                int i;
                int const n = normalizedCounter[s];
                MEM_write64(spread + pos, sv);
                for (i = 8; i < n; i += 8) {
                    MEM_write64(spread + pos + i, sv);
                }
                assert(n>=0);
                pos += (size_t)n;
            }
        }
        /* Now we spread those positions across the table.
         * The benefit of doing it in two stages is that we avoid the
         * variable size inner loop, which caused lots of branch misses.
         * Now we can run through all the positions without any branch misses.
         * We unroll the loop twice, since that is what empirically worked best.
560 */ 561 { 562 size_t position = 0; 563 size_t s; 564 size_t const unroll = 2; 565 assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ 566 for (s = 0; s < (size_t)tableSize; s += unroll) { 567 size_t u; 568 for (u = 0; u < unroll; ++u) { 569 size_t const uPosition = (position + (u * step)) & tableMask; 570 tableDecode[uPosition].baseValue = spread[s + u]; 571 } 572 position = (position + (unroll * step)) & tableMask; 573 } 574 assert(position == 0); 575 } 576 } else { 577 U32 const tableMask = tableSize-1; 578 U32 const step = FSE_TABLESTEP(tableSize); 579 U32 s, position = 0; 580 for (s=0; s<maxSV1; s++) { 581 int i; 582 int const n = normalizedCounter[s]; 583 for (i=0; i<n; i++) { 584 tableDecode[position].baseValue = s; 585 position = (position + step) & tableMask; 586 while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ 587 } } 588 assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ 589 } 590 591 /* Build Decoding table */ 592 { 593 U32 u; 594 for (u=0; u<tableSize; u++) { 595 U32 const symbol = tableDecode[u].baseValue; 596 U32 const nextState = symbolNext[symbol]++; 597 tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) ); 598 tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize); 599 assert(nbAdditionalBits[symbol] < 255); 600 tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol]; 601 tableDecode[u].baseValue = baseValue[symbol]; 602 } 603 } 604 } 605 606 /* Avoids the FORCE_INLINE of the _body() function. */ 607 static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, 608 const short* normalizedCounter, unsigned maxSymbolValue, 609 const U32* baseValue, const U8* nbAdditionalBits, 610 unsigned tableLog, void* wksp, size_t wkspSize) 611 { 612 ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, 613 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 614 } 615 616 #if DYNAMIC_BMI2 617 BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, 618 const short* normalizedCounter, unsigned maxSymbolValue, 619 const U32* baseValue, const U8* nbAdditionalBits, 620 unsigned tableLog, void* wksp, size_t wkspSize) 621 { 622 ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, 623 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 624 } 625 #endif 626 627 void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, 628 const short* normalizedCounter, unsigned maxSymbolValue, 629 const U32* baseValue, const U8* nbAdditionalBits, 630 unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) 631 { 632 #if DYNAMIC_BMI2 633 if (bmi2) { 634 ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue, 635 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 636 return; 637 } 638 #endif 639 (void)bmi2; 640 ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue, 641 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 642 } 643 644 645 /*! 
/*! ZSTD_buildSeqTable() :
 * @return : nb bytes read from src,
 *           or an error code if it fails */
static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
                                 SymbolEncodingType_e type, unsigned max, U32 maxLog,
                                 const void* src, size_t srcSize,
                                 const U32* baseValue, const U8* nbAdditionalBits,
                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
                                 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
                                 int bmi2)
{
    switch(type)
    {
    case set_rle :
        RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
        {   U32 const symbol = *(const BYTE*)src;
            U32 const baseline = baseValue[symbol];
            U8 const nbBits = nbAdditionalBits[symbol];
            ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
        }
        *DTablePtr = DTableSpace;
        return 1;
    case set_basic :
        *DTablePtr = defaultTable;
        return 0;
    case set_repeat:
        RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
        /* prefetch FSE table if used */
        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
            const void* const pStart = *DTablePtr;
            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
            PREFETCH_AREA(pStart, pSize);
        }
        return 0;
    case set_compressed :
        {   unsigned tableLog;
            S16 norm[MaxSeq+1];
            size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
            RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
            RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
            ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
            *DTablePtr = DTableSpace;
            return headerSize;
        }
    default :
        assert(0);
        RETURN_ERROR(GENERIC, "impossible");
    }
}

size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                             const void* src, size_t srcSize)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* const iend = istart + srcSize;
    const BYTE* ip = istart;
    int nbSeq;
    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

    /* check */
    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");

    /* SeqHead */
    nbSeq = *ip++;
    if (nbSeq > 0x7F) {
        if (nbSeq == 0xFF) {
            RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
            nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
            ip+=2;
        } else {
            RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
        }
    }
    *nbSeqPtr = nbSeq;

    if (nbSeq == 0) {
        /* No sequence : section ends immediately */
        RETURN_ERROR_IF(ip != iend, corruption_detected,
            "extraneous data present in the Sequences section");
        return (size_t)(ip - istart);
    }

    /* FSE table descriptors */
    RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
    RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */
    {   SymbolEncodingType_e const LLtype = (SymbolEncodingType_e)(*ip >> 6);
        SymbolEncodingType_e const OFtype = (SymbolEncodingType_e)((*ip >> 4) & 3);
        SymbolEncodingType_e const MLtype = (SymbolEncodingType_e)((*ip >> 2) & 3);
        ip++;

        /* Build DTables */
        {   size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
                                                      LLtype, MaxLL, LLFSELog,
                                                      ip, iend-ip,
                                                      LL_base, LL_bits,
                                                      LL_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += llhSize;
        }

        {   size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
                                                      OFtype, MaxOff, OffFSELog,
                                                      ip, iend-ip,
                                                      OF_base, OF_bits,
                                                      OF_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += ofhSize;
        }

        {   size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
                                                      MLtype, MaxML, MLFSELog,
                                                      ip, iend-ip,
                                                      ML_base, ML_bits,
                                                      ML_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += mlhSize;
        }
    }

    return ip-istart;
}


typedef struct {
    size_t litLength;
    size_t matchLength;
    size_t offset;
} seq_t;

typedef struct {
    size_t state;
    const ZSTD_seqSymbol* table;
} ZSTD_fseState;

typedef struct {
    BIT_DStream_t DStream;
    ZSTD_fseState stateLL;
    ZSTD_fseState stateOffb;
    ZSTD_fseState stateML;
    size_t prevOffset[ZSTD_REP_NUM];
} seqState_t;

/*! ZSTD_overlapCopy8() :
 *  Copies 8 bytes from ip to op and updates op and ip where ip <= op.
 *  If the offset is < 8 then the offset is spread to at least 8 bytes.
 *
 *  Precondition: *ip <= *op
 *  Postcondition: *op - *ip >= 8
 */
HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
    assert(*ip <= *op);
    if (offset < 8) {
        /* close range match, overlap */
        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
        int const sub2 = dec64table[offset];
        (*op)[0] = (*ip)[0];
        (*op)[1] = (*ip)[1];
        (*op)[2] = (*ip)[2];
        (*op)[3] = (*ip)[3];
        *ip += dec32table[offset];
        ZSTD_copy4(*op+4, *ip);
        *ip -= sub2;
    } else {
        ZSTD_copy8(*op, *ip);
    }
    *ip += 8;
    *op += 8;
    assert(*op - *ip >= 8);
}
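/* Illustrative sketch (not part of the library) : ZSTD_overlapCopy8() above,
 * run on an assumed example with offset == 2 (match source two bytes behind
 * the write pointer, pattern "AB"):
 *
 *   op[0..3] are copied byte by byte                        -> A B A B
 *   *ip += dec32table[2] (== 2), so ip now equals op, and ZSTD_copy4(op+4, ip)
 *   duplicates those four freshly written bytes             -> A B A B A B A B
 *   *ip -= dec64table[2] (== 8), then both pointers advance by 8, leaving
 *   *op - *ip == 8, so the caller can continue with plain 8-byte copies.
 */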
/*! ZSTD_safecopy() :
 *  Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
 *  and write up to 16 bytes past oend_w (op >= oend_w is allowed).
 *  This function is only called in the uncommon case where the sequence is near the end of the block. It
 *  should be fast for a single long sequence, but can be slow for several short sequences.
 *
 *  @param ovtype controls the overlap detection
 *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
 *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
 *           The src buffer must be before the dst buffer.
 */
static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
           (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));

    if (length < 8) {
        /* Handle short lengths. */
        while (op < oend) *op++ = *ip++;
        return;
    }
    if (ovtype == ZSTD_overlap_src_before_dst) {
        /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
        assert(length >= 8);
        ZSTD_overlapCopy8(&op, &ip, diff);
        length -= 8;
        assert(op - ip >= 8);
        assert(op <= oend);
    }

    if (oend <= oend_w) {
        /* No risk of overwrite. */
        ZSTD_wildcopy(op, ip, length, ovtype);
        return;
    }
    if (op <= oend_w) {
        /* Wildcopy until we get close to the end. */
        assert(oend > oend_w);
        ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
        ip += oend_w - op;
        op += oend_w - op;
    }
    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_safecopyDstBeforeSrc():
 * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
 * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    if (length < 8 || diff > -8) {
        /* Handle short lengths, close overlaps, and dst not before src. */
        while (op < oend) *op++ = *ip++;
        return;
    }

    if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
        ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
        ip += oend - WILDCOPY_OVERLENGTH - op;
        op += oend - WILDCOPY_OVERLENGTH - op;
    }

    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_execSequenceEnd():
 * This version handles cases that are near the end of the output buffer. It requires
 * more careful checks to make sure there is no overflow. By separating out these hard
 * and unlikely cases, we can speed up the common cases.
 *
 * NOTE: This function needs to be fast for a single long sequence, but doesn't need
 * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
 */
FORCE_NOINLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequenceEnd(BYTE* op,
    BYTE* const oend, seq_t sequence,
    const BYTE** litPtr, const BYTE* const litLimit,
    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;
    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;

    /* bounds checks : careful of address space overflow in 32-bit mode */
    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
    assert(op < op + sequenceLength);
    assert(oLitEnd < op + sequenceLength);

    /* copy literals */
    ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
    op = oLitEnd;
    *litPtr = iLitEnd;

    /* copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
        match = dictEnd - (prefixStart - match);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
        }
    }
    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
    return sequenceLength;
}
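/* Illustrative sketch (not part of the library) : what "executing" one sequence
 * means for the copy routines above and below. Numbers are an assumed example.
 *
 *   seq = { litLength = 4, matchLength = 10, offset = 2 }
 *     1) 4 literal bytes are copied from the literal buffer to op.
 *     2) 10 match bytes are copied from (op - 2), i.e. the last two bytes just
 *        written are repeated five times (an overlapping copy, handled by
 *        ZSTD_overlapCopy8 / ZSTD_safecopy with ZSTD_overlap_src_before_dst).
 *   If offset reaches back beyond prefixStart, the first part of the match is
 *   taken from the dictionary (ending at dictEnd) and may span back into the
 *   current prefix, as the extDict branch above shows.
 */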
953 */ 954 FORCE_NOINLINE 955 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 956 size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, 957 BYTE* const oend, const BYTE* const oend_w, seq_t sequence, 958 const BYTE** litPtr, const BYTE* const litLimit, 959 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 960 { 961 BYTE* const oLitEnd = op + sequence.litLength; 962 size_t const sequenceLength = sequence.litLength + sequence.matchLength; 963 const BYTE* const iLitEnd = *litPtr + sequence.litLength; 964 const BYTE* match = oLitEnd - sequence.offset; 965 966 967 /* bounds checks : careful of address space overflow in 32-bit mode */ 968 RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); 969 RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); 970 assert(op < op + sequenceLength); 971 assert(oLitEnd < op + sequenceLength); 972 973 /* copy literals */ 974 RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); 975 ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); 976 op = oLitEnd; 977 *litPtr = iLitEnd; 978 979 /* copy Match */ 980 if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 981 /* offset beyond prefix */ 982 RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); 983 match = dictEnd - (prefixStart - match); 984 if (match + sequence.matchLength <= dictEnd) { 985 ZSTD_memmove(oLitEnd, match, sequence.matchLength); 986 return sequenceLength; 987 } 988 /* span extDict & currentPrefixSegment */ 989 { size_t const length1 = dictEnd - match; 990 ZSTD_memmove(oLitEnd, match, length1); 991 op = oLitEnd + length1; 992 sequence.matchLength -= length1; 993 match = prefixStart; 994 } 995 } 996 ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); 997 return sequenceLength; 998 } 999 1000 HINT_INLINE 1001 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1002 size_t ZSTD_execSequence(BYTE* op, 1003 BYTE* const oend, seq_t sequence, 1004 const BYTE** litPtr, const BYTE* const litLimit, 1005 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 1006 { 1007 BYTE* const oLitEnd = op + sequence.litLength; 1008 size_t const sequenceLength = sequence.litLength + sequence.matchLength; 1009 BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ 1010 BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ 1011 const BYTE* const iLitEnd = *litPtr + sequence.litLength; 1012 const BYTE* match = oLitEnd - sequence.offset; 1013 1014 assert(op != NULL /* Precondition */); 1015 assert(oend_w < oend /* No underflow */); 1016 1017 #if defined(__aarch64__) 1018 /* prefetch sequence starting from match that will be used for copy later */ 1019 PREFETCH_L1(match); 1020 #endif 1021 /* Handle edge cases in a slow path: 1022 * - Read beyond end of literals 1023 * - Match end is within WILDCOPY_OVERLIMIT of oend 1024 * - 32-bit mode and the match length overflows 1025 */ 1026 if (UNLIKELY( 1027 iLitEnd > litLimit || 1028 oMatchEnd > oend_w || 1029 (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) 1030 return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 1031 1032 /* Assumptions (everything else goes into 
    assert(op <= oLitEnd /* No overflow */);
    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
    assert(oMatchEnd <= oend /* No underflow */);
    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);

    /* Copy Literals:
     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
     * We likely don't need the full 32-byte wildcopy.
     */
    assert(WILDCOPY_OVERLENGTH >= 16);
    ZSTD_copy16(op, (*litPtr));
    if (UNLIKELY(sequence.litLength > 16)) {
        ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
    }
    op = oLitEnd;
    *litPtr = iLitEnd;   /* update for next sequence */

    /* Copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix -> go into extDict */
        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
        match = dictEnd + (match - prefixStart);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
        }
    }
    /* Match within prefix of 1 or more bytes */
    assert(op <= oMatchEnd);
    assert(oMatchEnd <= oend_w);
    assert(match >= prefixStart);
    assert(sequence.matchLength >= 1);

    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
     * without overlap checking.
     */
    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
        /* We bet on a full wildcopy for matches, since we expect matches to be
         * longer than literals (in general). In silesia, ~10% of matches are longer
         * than 16 bytes.
         */
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
        return sequenceLength;
    }
    assert(sequence.offset < WILDCOPY_VECLEN);

    /* Copy 8 bytes and spread the offset to be >= 8. */
    ZSTD_overlapCopy8(&op, &match, sequence.offset);

    /* If the match length is > 8 bytes, then continue with the wildcopy. */
    if (sequence.matchLength > 8) {
        assert(op < oMatchEnd);
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
    }
    return sequenceLength;
}

HINT_INLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
    BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
    const BYTE** litPtr, const BYTE* const litLimit,
    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;

    assert(op != NULL /* Precondition */);
    assert(oend_w < oend /* No underflow */);
    /* Handle edge cases in a slow path:
     *   - Read beyond end of literals
     *   - Match end is within WILDCOPY_OVERLENGTH of oend
     *   - 32-bit mode and the match length overflows
     */
    if (UNLIKELY(
            iLitEnd > litLimit ||
            oMatchEnd > oend_w ||
            (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
        return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
    assert(op <= oLitEnd /* No overflow */);
    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
    assert(oMatchEnd <= oend /* No underflow */);
    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);

    /* Copy Literals:
     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
     * We likely don't need the full 32-byte wildcopy.
     */
    assert(WILDCOPY_OVERLENGTH >= 16);
    ZSTD_copy16(op, (*litPtr));
    if (UNLIKELY(sequence.litLength > 16)) {
        ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
    }
    op = oLitEnd;
    *litPtr = iLitEnd;   /* update for next sequence */

    /* Copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix -> go into extDict */
        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
        match = dictEnd + (match - prefixStart);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
    }   }
    /* Match within prefix of 1 or more bytes */
    assert(op <= oMatchEnd);
    assert(oMatchEnd <= oend_w);
    assert(match >= prefixStart);
    assert(sequence.matchLength >= 1);

    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
     * without overlap checking.
     */
    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
        /* We bet on a full wildcopy for matches, since we expect matches to be
         * longer than literals (in general). In silesia, ~10% of matches are longer
         * than 16 bytes.
         */
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
        return sequenceLength;
    }
    assert(sequence.offset < WILDCOPY_VECLEN);

    /* Copy 8 bytes and spread the offset to be >= 8. */
    ZSTD_overlapCopy8(&op, &match, sequence.offset);

    /* If the match length is > 8 bytes, then continue with the wildcopy. */
    if (sequence.matchLength > 8) {
        assert(op < oMatchEnd);
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
    }
    return sequenceLength;
}


static void
ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
{
    const void* ptr = dt;
    const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr;
    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
    DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",
                (U32)DStatePtr->state, DTableH->tableLog);
    BIT_reloadDStream(bitD);
    DStatePtr->table = dt + 1;
}

FORCE_INLINE_TEMPLATE void
ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
{
    size_t const lowBits = BIT_readBits(bitD, nbBits);
    DStatePtr->state = nextState + lowBits;
}

/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
 * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
 * bits before reloading. This value is the maximum number of bytes we read
 * after reloading when we are decoding long offsets.
 */
#define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
    (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
        ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32  \
        : 0)

typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;

/*
 * ZSTD_decodeSequence():
 * @p longOffsets : tells the decoder to reload more bits while decoding large offsets
 *                  only used in 32-bit mode
 * @return : Sequence (litL + matchL + offset)
 */
FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
{
    seq_t seq;
    /*
     * ZSTD_seqSymbol is a 64 bits wide structure.
     * It can be loaded in one operation
     * and its fields extracted by simply shifting or bit-extracting on aarch64.
     * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
     * operations that cause performance drop. This can be avoided by using this
     * ZSTD_memcpy hack.
     */
#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
    ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
    ZSTD_seqSymbol* const llDInfo = &llDInfoS;
    ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
    ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
    ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
    ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
    ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
#else
    const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
    const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
    const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
#endif
    seq.matchLength = mlDInfo->baseValue;
    seq.litLength = llDInfo->baseValue;
    {   U32 const ofBase = ofDInfo->baseValue;
        BYTE const llBits = llDInfo->nbAdditionalBits;
        BYTE const mlBits = mlDInfo->nbAdditionalBits;
        BYTE const ofBits = ofDInfo->nbAdditionalBits;
        BYTE const totalBits = llBits+mlBits+ofBits;

        U16 const llNext = llDInfo->nextState;
        U16 const mlNext = mlDInfo->nextState;
        U16 const ofNext = ofDInfo->nextState;
        U32 const llnbBits = llDInfo->nbBits;
        U32 const mlnbBits = mlDInfo->nbBits;
        U32 const ofnbBits = ofDInfo->nbBits;

        assert(llBits <= MaxLLBits);
        assert(mlBits <= MaxMLBits);
        assert(ofBits <= MaxOff);
        /*
         * As gcc has better branch and block analyzers, sometimes it is only
         * valuable to mark likeliness for clang, it gives around 3-4% of
         * performance.
         */

        /* sequence */
        {   size_t offset;
            if (ofBits > 1) {
                ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
                ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
                if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
                    /* Always read extra bits, this keeps the logic simple,
                     * avoids branches, and avoids accidentally reading 0 bits.
                     */
                    U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
                    offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
                    BIT_reloadDStream(&seqState->DStream);
                    offset += BIT_readBitsFast(&seqState->DStream, extraBits);
                } else {
                    offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
                    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
                }
                seqState->prevOffset[2] = seqState->prevOffset[1];
                seqState->prevOffset[1] = seqState->prevOffset[0];
                seqState->prevOffset[0] = offset;
            } else {
                U32 const ll0 = (llDInfo->baseValue == 0);
                if (LIKELY((ofBits == 0))) {
                    offset = seqState->prevOffset[ll0];
                    seqState->prevOffset[1] = seqState->prevOffset[!ll0];
                    seqState->prevOffset[0] = offset;
                } else {
                    offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
                    {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
                        temp -= !temp;   /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */
                        if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                        seqState->prevOffset[1] = seqState->prevOffset[0];
                        seqState->prevOffset[0] = offset = temp;
            }   }   }
            seq.offset = offset;
        }

        if (mlBits > 0)
            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);

        if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
            BIT_reloadDStream(&seqState->DStream);
        if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
            BIT_reloadDStream(&seqState->DStream);
        /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);

        if (llBits > 0)
            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);

        if (MEM_32bits())
            BIT_reloadDStream(&seqState->DStream);

        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
                    (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);

        if (!isLastSeq) {
            /* don't update FSE state for last Sequence */
            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
            BIT_reloadDStream(&seqState->DStream);
        }
    }

    return seq;
}
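/* Illustrative note (not part of the library) : the repeat-offset branch of
 * ZSTD_decodeSequence() above (ofBits <= 1) implements the Offset_Value 1-3
 * repcode convention of the zstd format specification. With litLength > 0
 * (ll0 == 0):
 *     value 1 -> prevOffset[0] (rep1), 2 -> rep2, 3 -> rep3
 * With litLength == 0 (ll0 == 1) the meanings shift:
 *     value 1 -> rep2, 2 -> rep3, 3 -> rep1 - 1
 * which is why the code adds ll0 into the value / prevOffset[] index and
 * treats the computed value 3 as prevOffset[0] - 1.
 */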
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
#if DEBUGLEVEL >= 1
static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
{
    size_t const windowSize = dctx->fParams.windowSize;
    /* No dictionary used. */
    if (dctx->dictContentEndForFuzzing == NULL) return 0;
    /* Dictionary is our prefix. */
    if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
    /* Dictionary is not our ext-dict. */
    if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
    /* Dictionary is not within our window size. */
    if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
    /* Dictionary is active. */
    return 1;
}
#endif

static void ZSTD_assertValidSequence(
        ZSTD_DCtx const* dctx,
        BYTE const* op, BYTE const* oend,
        seq_t const seq,
        BYTE const* prefixStart, BYTE const* virtualStart)
{
#if DEBUGLEVEL >= 1
    if (dctx->isFrameDecompression) {
        size_t const windowSize = dctx->fParams.windowSize;
        size_t const sequenceSize = seq.litLength + seq.matchLength;
        BYTE const* const oLitEnd = op + seq.litLength;
        DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
        assert(op <= oend);
        assert((size_t)(oend - op) >= sequenceSize);
        assert(sequenceSize <= ZSTD_blockSizeMax(dctx));
        if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
            size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
            /* Offset must be within the dictionary. */
            assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
            assert(seq.offset <= windowSize + dictSize);
        } else {
            /* Offset must be within our window. */
            assert(seq.offset <= windowSize);
        }
    }
#else
    (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
#endif
}
#endif

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG


FORCE_INLINE_TEMPLATE size_t
DONT_VECTORIZE
ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
                               void* dst, size_t maxDstSize,
                         const void* seqStart, size_t seqSize, int nbSeq,
                         const ZSTD_longOffset_e isLongOffset)
{
    const BYTE* ip = (const BYTE*)seqStart;
    const BYTE* const iend = ip + seqSize;
    BYTE* const ostart = (BYTE*)dst;
    BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
    BYTE* op = ostart;
    const BYTE* litPtr = dctx->litPtr;
    const BYTE* litBufferEnd = dctx->litBufferEnd;
    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
    const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq);

    /* Literals are split between internal buffer & output buffer */
    if (nbSeq) {
        seqState_t seqState;
        dctx->fseEntropy = 1;
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
        RETURN_ERROR_IF(
            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
            corruption_detected, "");
        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
        assert(dst != NULL);

        ZSTD_STATIC_ASSERT(
                BIT_DStream_unfinished < BIT_DStream_completed &&
                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
                BIT_DStream_completed < BIT_DStream_overflow);

        /* decompress without overrunning litPtr begins */
        {   seq_t sequence = {0,0,0};  /* some static analyzers believe that @sequence is not initialized (it necessarily is, since the for(;;) loop has at least one iteration) */
            /* Align the decompression loop to 32 + 16 bytes.
             *
             * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
             * speed swings based on the alignment of the decompression loop.
This
1445 * performance swing is caused by parts of the decompression loop falling
1446 * out of the DSB. The entire decompression loop should fit in the DSB;
1447 * when it can't, we get much worse performance. You can measure if you've
1448 * hit the good case or the bad case with this perf command for some
1449 * compressed file test.zst:
1450 *
1451 * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1452 * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1453 *
1454 * If you see most cycles served out of the MITE you've hit the bad case.
1455 * If you see most cycles served out of the DSB you've hit the good case.
1456 * If it is pretty even then you may be in an okay case.
1457 *
1458 * This issue has been reproduced on the following CPUs:
1459 * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1460 * Use Instruments->Counters to get DSB/MITE cycles.
1461 * I never got performance swings, but I was able to
1462 * go from the good case of mostly DSB to half of the
1463 * cycles served from MITE.
1464 * - Coffeelake: Intel i9-9900k
1465 * - Coffeelake: Intel i7-9700k
1466 *
1467 * I haven't been able to reproduce the instability or DSB misses on any
1468 * of the following CPUs:
1469 * - Haswell
1470 * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
1471 * - Skylake
1472 *
1473 * Alignment is done for each of the three major decompression loops:
1474 * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
1475 * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
1476 * - ZSTD_decompressSequences_body
1477 * Alignment choices are made to minimize large swings on bad cases and influence on performance
1478 * from changes external to this code, rather than to overoptimize on the current commit.
1479 *
1480 * If you are seeing performance instability, this script can help test.
1481 * It tests on 4 commits in zstd where I saw performance change.
1482 * 1483 * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 1484 */ 1485 #if defined(__x86_64__) 1486 __asm__(".p2align 6"); 1487 # if __GNUC__ >= 7 1488 /* good for gcc-7, gcc-9, and gcc-11 */ 1489 __asm__("nop"); 1490 __asm__(".p2align 5"); 1491 __asm__("nop"); 1492 __asm__(".p2align 4"); 1493 # if __GNUC__ == 8 || __GNUC__ == 10 1494 /* good for gcc-8 and gcc-10 */ 1495 __asm__("nop"); 1496 __asm__(".p2align 3"); 1497 # endif 1498 # endif 1499 #endif 1500 1501 /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ 1502 for ( ; nbSeq; nbSeq--) { 1503 sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); 1504 if (litPtr + sequence.litLength > dctx->litBufferEnd) break; 1505 { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1506 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1507 assert(!ZSTD_isError(oneSeqSize)); 1508 ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1509 #endif 1510 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1511 return oneSeqSize; 1512 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1513 op += oneSeqSize; 1514 } } 1515 DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); 1516 1517 /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ 1518 if (nbSeq > 0) { 1519 const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1520 DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); 1521 if (leftoverLit) { 1522 RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1523 ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1524 sequence.litLength -= leftoverLit; 1525 op += leftoverLit; 1526 } 1527 litPtr = dctx->litExtraBuffer; 1528 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1529 dctx->litBufferLocation = ZSTD_not_in_dst; 1530 { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1531 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1532 assert(!ZSTD_isError(oneSeqSize)); 1533 ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1534 #endif 1535 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1536 return oneSeqSize; 1537 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1538 op += oneSeqSize; 1539 } 1540 nbSeq--; 1541 } 1542 } 1543 1544 if (nbSeq > 0) { 1545 /* there is remaining lit from extra buffer */ 1546 1547 #if defined(__x86_64__) 1548 __asm__(".p2align 6"); 1549 __asm__("nop"); 1550 # if __GNUC__ != 7 1551 /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ 1552 __asm__(".p2align 4"); 1553 __asm__("nop"); 1554 __asm__(".p2align 3"); 1555 # elif __GNUC__ >= 11 1556 __asm__(".p2align 3"); 1557 # else 1558 __asm__(".p2align 5"); 1559 __asm__("nop"); 1560 __asm__(".p2align 3"); 1561 # endif 1562 #endif 1563 1564 for ( ; nbSeq ; nbSeq--) { 1565 seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); 1566 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1567 #if 
defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1568 assert(!ZSTD_isError(oneSeqSize)); 1569 ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1570 #endif 1571 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1572 return oneSeqSize; 1573 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1574 op += oneSeqSize; 1575 } 1576 } 1577 1578 /* check if reached exact end */ 1579 DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); 1580 RETURN_ERROR_IF(nbSeq, corruption_detected, ""); 1581 DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); 1582 RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); 1583 /* save reps for next block */ 1584 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1585 } 1586 1587 /* last literal segment */ 1588 if (dctx->litBufferLocation == ZSTD_split) { 1589 /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ 1590 size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); 1591 DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); 1592 RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1593 if (op != NULL) { 1594 ZSTD_memmove(op, litPtr, lastLLSize); 1595 op += lastLLSize; 1596 } 1597 litPtr = dctx->litExtraBuffer; 1598 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1599 dctx->litBufferLocation = ZSTD_not_in_dst; 1600 } 1601 /* copy last literals from internal buffer */ 1602 { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); 1603 DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); 1604 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1605 if (op != NULL) { 1606 ZSTD_memcpy(op, litPtr, lastLLSize); 1607 op += lastLLSize; 1608 } } 1609 1610 DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); 1611 return (size_t)(op - ostart); 1612 } 1613 1614 FORCE_INLINE_TEMPLATE size_t 1615 DONT_VECTORIZE 1616 ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, 1617 void* dst, size_t maxDstSize, 1618 const void* seqStart, size_t seqSize, int nbSeq, 1619 const ZSTD_longOffset_e isLongOffset) 1620 { 1621 const BYTE* ip = (const BYTE*)seqStart; 1622 const BYTE* const iend = ip + seqSize; 1623 BYTE* const ostart = (BYTE*)dst; 1624 BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; 1625 BYTE* op = ostart; 1626 const BYTE* litPtr = dctx->litPtr; 1627 const BYTE* const litEnd = litPtr + dctx->litSize; 1628 const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); 1629 const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); 1630 const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); 1631 DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); 1632 1633 /* Regen sequences */ 1634 if (nbSeq) { 1635 seqState_t seqState; 1636 dctx->fseEntropy = 1; 1637 { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1638 RETURN_ERROR_IF( 1639 ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), 1640 corruption_detected, ""); 1641 ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1642 ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1643 ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1644 assert(dst != NULL); 1645 1646 #if defined(__x86_64__) 1647 __asm__(".p2align 6"); 1648 __asm__("nop"); 1649 # if __GNUC__ >= 7 1650 __asm__(".p2align 5"); 1651 __asm__("nop"); 1652 __asm__(".p2align 3"); 1653 # else 1654 __asm__(".p2align 4"); 1655 __asm__("nop"); 1656 __asm__(".p2align 3"); 1657 # endif 1658 #endif 1659 1660 for ( ; nbSeq ; nbSeq--) { 1661 seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); 1662 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); 1663 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1664 assert(!ZSTD_isError(oneSeqSize)); 1665 ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1666 #endif 1667 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1668 return oneSeqSize; 1669 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1670 op += oneSeqSize; 1671 } 1672 1673 /* check if reached exact end */ 1674 assert(nbSeq == 0); 1675 RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); 1676 /* save reps for next block */ 1677 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1678 } 1679 1680 /* last literal segment */ 1681 { size_t const lastLLSize = (size_t)(litEnd - litPtr); 1682 DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); 1683 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1684 if (op != NULL) { 1685 ZSTD_memcpy(op, litPtr, lastLLSize); 1686 op += lastLLSize; 1687 } } 1688 1689 DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); 1690 return (size_t)(op - ostart); 1691 } 1692 1693 static size_t 1694 ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, 1695 void* dst, size_t maxDstSize, 1696 const void* seqStart, size_t seqSize, int nbSeq, 1697 const ZSTD_longOffset_e isLongOffset) 1698 { 1699 return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1700 } 1701 1702 static size_t 1703 ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, 1704 void* dst, size_t maxDstSize, 1705 const void* seqStart, size_t seqSize, int nbSeq, 1706 const ZSTD_longOffset_e isLongOffset) 1707 { 1708 return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1709 } 1710 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1711 1712 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1713 1714 FORCE_INLINE_TEMPLATE 1715 1716 
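/* ZSTD_prefetchMatch() :
 * @prefetchPos is the expected output position of the current sequence, expressed relative to prefixStart.
 * Issues L1 prefetches for the region the match of @sequence is expected to be copied from,
 * then returns the output position following this sequence (prefetchPos + litLength + matchLength),
 * to be passed to the next call. */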
size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, 1717 const BYTE* const prefixStart, const BYTE* const dictEnd) 1718 { 1719 prefetchPos += sequence.litLength; 1720 { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; 1721 /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 1722 * No consequence though : memory address is only used for prefetching, not for dereferencing */ 1723 const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); 1724 PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ 1725 } 1726 return prefetchPos + sequence.matchLength; 1727 } 1728 1729 /* This decoding function employs prefetching 1730 * to reduce latency impact of cache misses. 1731 * It's generally employed when block contains a significant portion of long-distance matches 1732 * or when coupled with a "cold" dictionary */ 1733 FORCE_INLINE_TEMPLATE size_t 1734 ZSTD_decompressSequencesLong_body( 1735 ZSTD_DCtx* dctx, 1736 void* dst, size_t maxDstSize, 1737 const void* seqStart, size_t seqSize, int nbSeq, 1738 const ZSTD_longOffset_e isLongOffset) 1739 { 1740 const BYTE* ip = (const BYTE*)seqStart; 1741 const BYTE* const iend = ip + seqSize; 1742 BYTE* const ostart = (BYTE*)dst; 1743 BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); 1744 BYTE* op = ostart; 1745 const BYTE* litPtr = dctx->litPtr; 1746 const BYTE* litBufferEnd = dctx->litBufferEnd; 1747 const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); 1748 const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); 1749 const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); 1750 1751 /* Regen sequences */ 1752 if (nbSeq) { 1753 #define STORED_SEQS 8 1754 #define STORED_SEQS_MASK (STORED_SEQS-1) 1755 #define ADVANCED_SEQS STORED_SEQS 1756 seq_t sequences[STORED_SEQS]; 1757 int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); 1758 seqState_t seqState; 1759 int seqNb; 1760 size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ 1761 1762 dctx->fseEntropy = 1; 1763 { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1764 assert(dst != NULL); 1765 assert(iend >= ip); 1766 RETURN_ERROR_IF( 1767 ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), 1768 corruption_detected, ""); 1769 ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1770 ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1771 ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1772 1773 /* prepare in advance */ 1774 for (seqNb=0; seqNb<seqAdvance; seqNb++) { 1775 seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); 1776 prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1777 sequences[seqNb] = sequence; 1778 } 1779 1780 /* decompress without stomping litBuffer */ 1781 for (; seqNb < nbSeq; seqNb++) { 1782 seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); 1783 1784 if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { 1785 /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ 1786 const 
size_t leftoverLit = dctx->litBufferEnd - litPtr; 1787 if (leftoverLit) 1788 { 1789 RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1790 ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1791 sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; 1792 op += leftoverLit; 1793 } 1794 litPtr = dctx->litExtraBuffer; 1795 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1796 dctx->litBufferLocation = ZSTD_not_in_dst; 1797 { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1798 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1799 assert(!ZSTD_isError(oneSeqSize)); 1800 ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1801 #endif 1802 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1803 1804 prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1805 sequences[seqNb & STORED_SEQS_MASK] = sequence; 1806 op += oneSeqSize; 1807 } } 1808 else 1809 { 1810 /* lit buffer is either wholly contained in first or second split, or not split at all*/ 1811 size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 1812 ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1813 ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1814 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1815 assert(!ZSTD_isError(oneSeqSize)); 1816 ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1817 #endif 1818 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1819 1820 prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1821 sequences[seqNb & STORED_SEQS_MASK] = sequence; 1822 op += oneSeqSize; 1823 } 1824 } 1825 RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); 1826 1827 /* finish queue */ 1828 seqNb -= seqAdvance; 1829 for ( ; seqNb<nbSeq ; seqNb++) { 1830 seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]); 1831 if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { 1832 const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1833 if (leftoverLit) { 1834 RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1835 ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1836 sequence->litLength -= leftoverLit; 1837 op += leftoverLit; 1838 } 1839 litPtr = dctx->litExtraBuffer; 1840 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1841 dctx->litBufferLocation = ZSTD_not_in_dst; 1842 { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1843 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1844 assert(!ZSTD_isError(oneSeqSize)); 1845 ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1846 #endif 1847 if (ZSTD_isError(oneSeqSize)) return 
oneSeqSize; 1848 op += oneSeqSize; 1849 } 1850 } 1851 else 1852 { 1853 size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 1854 ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1855 ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1856 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1857 assert(!ZSTD_isError(oneSeqSize)); 1858 ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1859 #endif 1860 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1861 op += oneSeqSize; 1862 } 1863 } 1864 1865 /* save reps for next block */ 1866 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1867 } 1868 1869 /* last literal segment */ 1870 if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ 1871 size_t const lastLLSize = litBufferEnd - litPtr; 1872 RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1873 if (op != NULL) { 1874 ZSTD_memmove(op, litPtr, lastLLSize); 1875 op += lastLLSize; 1876 } 1877 litPtr = dctx->litExtraBuffer; 1878 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1879 } 1880 { size_t const lastLLSize = litBufferEnd - litPtr; 1881 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1882 if (op != NULL) { 1883 ZSTD_memmove(op, litPtr, lastLLSize); 1884 op += lastLLSize; 1885 } 1886 } 1887 1888 return (size_t)(op - ostart); 1889 } 1890 1891 static size_t 1892 ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, 1893 void* dst, size_t maxDstSize, 1894 const void* seqStart, size_t seqSize, int nbSeq, 1895 const ZSTD_longOffset_e isLongOffset) 1896 { 1897 return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1898 } 1899 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1900 1901 1902 1903 #if DYNAMIC_BMI2 1904 1905 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1906 static BMI2_TARGET_ATTRIBUTE size_t 1907 DONT_VECTORIZE 1908 ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, 1909 void* dst, size_t maxDstSize, 1910 const void* seqStart, size_t seqSize, int nbSeq, 1911 const ZSTD_longOffset_e isLongOffset) 1912 { 1913 return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1914 } 1915 static BMI2_TARGET_ATTRIBUTE size_t 1916 DONT_VECTORIZE 1917 ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, 1918 void* dst, size_t maxDstSize, 1919 const void* seqStart, size_t seqSize, int nbSeq, 1920 const ZSTD_longOffset_e isLongOffset) 1921 { 1922 return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1923 } 1924 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1925 1926 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1927 static BMI2_TARGET_ATTRIBUTE size_t 1928 ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, 1929 void* dst, size_t maxDstSize, 1930 const void* seqStart, size_t seqSize, int nbSeq, 1931 const ZSTD_longOffset_e isLongOffset) 1932 { 1933 return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1934 } 1935 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1936 1937 #endif /* DYNAMIC_BMI2 */ 1938 1939 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1940 static 
size_t
1941 ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
1942 const void* seqStart, size_t seqSize, int nbSeq,
1943 const ZSTD_longOffset_e isLongOffset)
1944 {
1945 DEBUGLOG(5, "ZSTD_decompressSequences");
1946 #if DYNAMIC_BMI2
1947 if (ZSTD_DCtx_get_bmi2(dctx)) {
1948 return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1949 }
1950 #endif
1951 return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1952 }
1953 static size_t
1954 ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
1955 const void* seqStart, size_t seqSize, int nbSeq,
1956 const ZSTD_longOffset_e isLongOffset)
1957 {
1958 DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
1959 #if DYNAMIC_BMI2
1960 if (ZSTD_DCtx_get_bmi2(dctx)) {
1961 return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1962 }
1963 #endif
1964 return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1965 }
1966 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1967
1968
1969 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1970 /* ZSTD_decompressSequencesLong() :
1971 * decompression function triggered when a minimum share of offsets is considered "long",
1972 * aka out of cache.
1973 * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
1974 * This function will try to mitigate main memory latency through the use of prefetching */
1975 static size_t
1976 ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
1977 void* dst, size_t maxDstSize,
1978 const void* seqStart, size_t seqSize, int nbSeq,
1979 const ZSTD_longOffset_e isLongOffset)
1980 {
1981 DEBUGLOG(5, "ZSTD_decompressSequencesLong");
1982 #if DYNAMIC_BMI2
1983 if (ZSTD_DCtx_get_bmi2(dctx)) {
1984 return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1985 }
1986 #endif
1987 return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1988 }
1989 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1990
1991
1992 /*
1993 * @returns The total size of the history referenceable by zstd, including
1994 * both the prefix and the extDict. At @p op any offset larger than this
1995 * is invalid.
1996 */
1997 static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
1998 {
1999 return (size_t)(op - virtualStart);
2000 }
2001
2002 typedef struct {
2003 unsigned longOffsetShare;
2004 unsigned maxNbAdditionalBits;
2005 } ZSTD_OffsetInfo;
2006
2007 /* ZSTD_getOffsetInfo() :
2008 * condition : offTable must be valid
2009 * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
2010 * compared to maximum possible of (1<<OffFSELog),
2011 * as well as the maximum number of additional bits required.
2012 */
2013 static ZSTD_OffsetInfo
2014 ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
2015 {
2016 ZSTD_OffsetInfo info = {0, 0};
2017 /* If nbSeq == 0, then the offTable is uninitialized, but we have
2018 * no sequences, so both values should be 0.
2019 */
2020 if (nbSeq != 0) {
2021 const void* ptr = offTable;
2022 U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
2023 const ZSTD_seqSymbol* table = offTable + 1;
2024 U32 const max = 1 << tableLog;
2025 U32 u;
2026 DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
2027
2028 assert(max <= (1 << OffFSELog)); /* max not too large */
2029 for (u=0; u<max; u++) {
2030 info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
2031 if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
2032 }
2033
2034 assert(tableLog <= OffFSELog);
2035 info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
2036 }
2037
2038 return info;
2039 }
2040
2041 /*
2042 * @returns The maximum offset we can decode in one read of our bitstream, without
2043 * reloading more bits in the middle of the offset bits read. Any offsets larger
2044 * than this must use the long offset decoder.
2045 */
2046 static size_t ZSTD_maxShortOffset(void)
2047 {
2048 if (MEM_64bits()) {
2049 /* We can decode any offset without reloading bits.
2050 * This might change if the max window size grows.
2051 */
2052 ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
2053 return (size_t)-1;
2054 } else {
2055 /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
2056 * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
2057 * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
2058 */
2059 size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
2060 size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
2061 assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
2062 return maxOffset;
2063 }
2064 }
2065
2066 size_t
2067 ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
2068 void* dst, size_t dstCapacity,
2069 const void* src, size_t srcSize, const streaming_operation streaming)
2070 { /* blockType == blockCompressed */
2071 const BYTE* ip = (const BYTE*)src;
2072 DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize);
2073
2074 /* Note : the wording of the specification
2075 * allows a compressed block to be sized exactly ZSTD_blockSizeMax(dctx).
2076 * This generally does not happen, as it makes little sense,
2077 * since an uncompressed block would feature the same size and have no decompression cost.
2078 * Also note that decoders from reference libzstd before v1.5.4
2079 * would consider this edge case as an error.
2080 * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
2081 * for broader compatibility with the deployed ecosystem of zstd decoders */
2082 RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
2083
2084 /* Decode literals section */
2085 { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
2086 DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
2087 if (ZSTD_isError(litCSize)) return litCSize;
2088 ip += litCSize;
2089 srcSize -= litCSize;
2090 }
2091
2092 /* Build Decoding Tables */
2093 {
2094 /* Compute the maximum block size, which must also work when !frame and fParams are unset.
2095 * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
2096 */
2097 size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
2098 size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart);
2099 /* isLongOffset must be true if there are long offsets.
2100 * Offsets are long if they are larger than ZSTD_maxShortOffset().
2101 * We don't expect that to be the case in 64-bit mode.
2102 *
2103 * We check here to see if our history is large enough to allow long offsets.
2104 * If it isn't, then we can't possibly have (valid) long offsets. If the offset
2105 * is invalid, then it is okay to read it incorrectly.
2106 *
2107 * If isLongOffset is true, then we will later check our decoding table to see
2108 * if it is even possible to generate long offsets.
2109 */
2110 ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
2111 /* These macros control at build-time which decompressor implementation
2112 * we use. If neither is defined, we do some inspection and dispatch at
2113 * runtime.
2114 */
2115 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
2116 !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
2117 int usePrefetchDecoder = dctx->ddictIsCold;
2118 #else
2119 /* Set to 1 to avoid computing offset info if we don't need to.
2120 * Otherwise this value is ignored.
2121 */
2122 int usePrefetchDecoder = 1;
2123 #endif
2124 int nbSeq;
2125 size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
2126 if (ZSTD_isError(seqHSize)) return seqHSize;
2127 ip += seqHSize;
2128 srcSize -= seqHSize;
2129
2130 RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
2131 RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
2132 "invalid dst");
2133
2134 /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
2135 * compute information about the share of long offsets, and the maximum nbAdditionalBits.
2136 * NOTE: could probably use a larger nbSeq limit
2137 */
2138 if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
2139 ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
2140 if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
2141 /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
2142 * enough, then we know it is impossible to have too long an offset in this block, so we can
2143 * use the regular offset decoder.
2144 */
2145 isLongOffset = ZSTD_lo_isRegularOffset;
2146 }
2147 if (!usePrefetchDecoder) {
2148 U32 const minShare = MEM_64bits() ?
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ 2149 usePrefetchDecoder = (info.longOffsetShare >= minShare); 2150 } 2151 } 2152 2153 dctx->ddictIsCold = 0; 2154 2155 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ 2156 !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) 2157 if (usePrefetchDecoder) { 2158 #else 2159 (void)usePrefetchDecoder; 2160 { 2161 #endif 2162 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 2163 return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); 2164 #endif 2165 } 2166 2167 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 2168 /* else */ 2169 if (dctx->litBufferLocation == ZSTD_split) 2170 return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); 2171 else 2172 return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); 2173 #endif 2174 } 2175 } 2176 2177 2178 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 2179 void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) 2180 { 2181 if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ 2182 dctx->dictEnd = dctx->previousDstEnd; 2183 dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); 2184 dctx->prefixStart = dst; 2185 dctx->previousDstEnd = dst; 2186 } 2187 } 2188 2189 2190 size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, 2191 void* dst, size_t dstCapacity, 2192 const void* src, size_t srcSize) 2193 { 2194 size_t dSize; 2195 dctx->isFrameDecompression = 0; 2196 ZSTD_checkContinuity(dctx, dst, dstCapacity); 2197 dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); 2198 FORWARD_IF_ERROR(dSize, ""); 2199 dctx->previousDstEnd = (char*)dst + dSize; 2200 return dSize; 2201 } 2202 2203 2204 /* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ 2205 size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, 2206 void* dst, size_t dstCapacity, 2207 const void* src, size_t srcSize) 2208 { 2209 return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); 2210 } 2211
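/* Illustrative sketch (kept under #if 0, never compiled) :
 * one way a caller might drive this block decoder through the experimental block-level API
 * declared in zstd.h (ZSTD_decompressBegin() / ZSTD_decompressBlock()) together with
 * ZSTD_getcBlockSize() defined above. The helper name below is hypothetical and error
 * handling is reduced to a minimum; a real caller must also handle bt_raw and bt_rle blocks,
 * and must decode the blocks of a frame in order so that the history window stays valid. */
#if 0
static size_t example_decodeSingleCompressedBlock(void* dst, size_t dstCapacity,
                                                  const void* src, size_t srcSize)
{
    blockProperties_t bp;
    size_t const cBlockSize = ZSTD_getcBlockSize(src, srcSize, &bp);  /* reads the 3-byte block header */
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    size_t dSize = ERROR(GENERIC);
    if (dctx != NULL && !ZSTD_isError(cBlockSize) && bp.blockType == bt_compressed) {
        ZSTD_decompressBegin(dctx);   /* fresh history, no dictionary */
        dSize = ZSTD_decompressBlock(dctx, dst, dstCapacity,
                                     (const BYTE*)src + ZSTD_blockHeaderSize, cBlockSize);
    }
    ZSTD_freeDCtx(dctx);
    return dSize;   /* nb of regenerated bytes, or an error code checkable with ZSTD_isError() */
}
#endif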