1 /* 2 * Copyright (c) Yann Collet, Facebook, Inc. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 /* zstd_decompress_block : 12 * this module takes care of decompressing _compressed_ block */ 13 14 /*-******************************************************* 15 * Dependencies 16 *********************************************************/ 17 #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ 18 #include "../common/compiler.h" /* prefetch */ 19 #include "../common/cpu.h" /* bmi2 */ 20 #include "../common/mem.h" /* low level memory routines */ 21 #define FSE_STATIC_LINKING_ONLY 22 #include "../common/fse.h" 23 #define HUF_STATIC_LINKING_ONLY 24 #include "../common/huf.h" 25 #include "../common/zstd_internal.h" 26 #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ 27 #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ 28 #include "zstd_decompress_block.h" 29 30 /*_******************************************************* 31 * Macros 32 **********************************************************/ 33 34 /* These two optional macros force the use one way or another of the two 35 * ZSTD_decompressSequences implementations. You can't force in both directions 36 * at the same time. 37 */ 38 #if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ 39 defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) 40 #error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!" 41 #endif 42 43 44 /*_******************************************************* 45 * Memory operations 46 **********************************************************/ 47 static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } 48 49 50 /*-************************************************************* 51 * Block decoding 52 ***************************************************************/ 53 54 /*! ZSTD_getcBlockSize() : 55 * Provides the size of compressed block from block header `src` */ 56 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, 57 blockProperties_t* bpPtr) 58 { 59 RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, ""); 60 61 { U32 const cBlockHeader = MEM_readLE24(src); 62 U32 const cSize = cBlockHeader >> 3; 63 bpPtr->lastBlock = cBlockHeader & 1; 64 bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3); 65 bpPtr->origSize = cSize; /* only useful for RLE */ 66 if (bpPtr->blockType == bt_rle) return 1; 67 RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, ""); 68 return cSize; 69 } 70 } 71 72 /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ 73 static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, 74 const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) 75 { 76 if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) 77 { 78 /* room for litbuffer to fit without read faulting */ 79 dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; 80 dctx->litBufferEnd = dctx->litBuffer + litSize; 81 dctx->litBufferLocation = ZSTD_in_dst; 82 } 83 else if (litSize > ZSTD_LITBUFFEREXTRASIZE) 84 { 85 /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ 86 if (splitImmediately) { 87 /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ 88 dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; 89 dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; 90 } 91 else { 92 /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ 93 dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; 94 dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; 95 } 96 dctx->litBufferLocation = ZSTD_split; 97 } 98 else 99 { 100 /* fits entirely within litExtraBuffer, so no split is necessary */ 101 dctx->litBuffer = dctx->litExtraBuffer; 102 dctx->litBufferEnd = dctx->litBuffer + litSize; 103 dctx->litBufferLocation = ZSTD_not_in_dst; 104 } 105 } 106 107 /* Hidden declaration for fullbench */ 108 size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, 109 const void* src, size_t srcSize, 110 void* dst, size_t dstCapacity, const streaming_operation streaming); 111 /*! ZSTD_decodeLiteralsBlock() : 112 * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored 113 * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current 114 * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being 115 * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. 116 * 117 * @return : nb of bytes read from src (< srcSize ) 118 * note : symbol not declared but exposed for fullbench */ 119 size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, 120 const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ 121 void* dst, size_t dstCapacity, const streaming_operation streaming) 122 { 123 DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); 124 RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); 125 126 { const BYTE* const istart = (const BYTE*) src; 127 symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); 128 129 switch(litEncType) 130 { 131 case set_repeat: 132 DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); 133 RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); 134 ZSTD_FALLTHROUGH; 135 136 case set_compressed: 137 RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); 138 { size_t lhSize, litSize, litCSize; 139 U32 singleStream=0; 140 U32 const lhlCode = (istart[0] >> 2) & 3; 141 U32 const lhc = MEM_readLE32(istart); 142 size_t hufSuccess; 143 size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); 144 switch(lhlCode) 145 { 146 case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ 147 /* 2 - 2 - 10 - 10 */ 148 singleStream = !lhlCode; 149 lhSize = 3; 150 litSize = (lhc >> 4) & 0x3FF; 151 litCSize = (lhc >> 14) & 0x3FF; 152 break; 153 case 2: 154 /* 2 - 2 - 14 - 14 */ 155 lhSize = 4; 156 litSize = (lhc >> 4) & 0x3FFF; 157 litCSize = lhc >> 18; 158 break; 159 case 3: 160 /* 2 - 2 - 18 - 18 */ 161 lhSize = 5; 162 litSize = (lhc >> 4) & 0x3FFFF; 163 litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); 164 break; 165 } 166 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); 167 RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); 168 RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); 169 RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); 170 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); 171 172 /* prefetch huffman table if cold */ 173 if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { 174 PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable)); 175 } 176 177 if (litEncType==set_repeat) { 178 if (singleStream) { 179 hufSuccess = HUF_decompress1X_usingDTable_bmi2( 180 dctx->litBuffer, litSize, istart+lhSize, litCSize, 181 dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); 182 } else { 183 hufSuccess = HUF_decompress4X_usingDTable_bmi2( 184 dctx->litBuffer, litSize, istart+lhSize, litCSize, 185 dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); 186 } 187 } else { 188 if (singleStream) { 189 #if defined(HUF_FORCE_DECOMPRESS_X2) 190 hufSuccess = HUF_decompress1X_DCtx_wksp( 191 dctx->entropy.hufTable, dctx->litBuffer, litSize, 192 istart+lhSize, litCSize, dctx->workspace, 193 sizeof(dctx->workspace)); 194 #else 195 hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( 196 dctx->entropy.hufTable, dctx->litBuffer, litSize, 197 istart+lhSize, litCSize, dctx->workspace, 198 sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); 199 #endif 200 } else { 201 hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( 202 dctx->entropy.hufTable, dctx->litBuffer, litSize, 203 istart+lhSize, litCSize, dctx->workspace, 204 sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); 205 } 206 } 207 if (dctx->litBufferLocation == ZSTD_split) 208 { 209 ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); 210 ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); 211 dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; 212 dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; 213 } 214 215 RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); 216 217 dctx->litPtr = dctx->litBuffer; 218 dctx->litSize = litSize; 219 dctx->litEntropy = 1; 220 if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; 221 return litCSize + lhSize; 222 } 223 224 case set_basic: 225 { size_t litSize, lhSize; 226 U32 const lhlCode = ((istart[0]) >> 2) & 3; 227 size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); 228 switch(lhlCode) 229 { 230 case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ 231 lhSize = 1; 232 litSize = istart[0] >> 3; 233 break; 234 case 1: 235 lhSize = 2; 236 litSize = MEM_readLE16(istart) >> 4; 237 break; 238 case 3: 239 lhSize = 3; 240 litSize = MEM_readLE24(istart) >> 4; 241 break; 242 } 243 244 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); 245 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); 246 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); 247 if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ 248 RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); 249 if (dctx->litBufferLocation == ZSTD_split) 250 { 251 ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); 252 ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); 253 } 254 else 255 { 256 ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); 257 } 258 dctx->litPtr = dctx->litBuffer; 259 dctx->litSize = litSize; 260 return lhSize+litSize; 261 } 262 /* direct reference into compressed stream */ 263 dctx->litPtr = istart+lhSize; 264 dctx->litSize = litSize; 265 dctx->litBufferEnd = dctx->litPtr + litSize; 266 dctx->litBufferLocation = ZSTD_not_in_dst; 267 return lhSize+litSize; 268 } 269 270 case set_rle: 271 { U32 const lhlCode = ((istart[0]) >> 2) & 3; 272 size_t litSize, lhSize; 273 size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); 274 switch(lhlCode) 275 { 276 case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ 277 lhSize = 1; 278 litSize = istart[0] >> 3; 279 break; 280 case 1: 281 lhSize = 2; 282 litSize = MEM_readLE16(istart) >> 4; 283 break; 284 case 3: 285 lhSize = 3; 286 litSize = MEM_readLE24(istart) >> 4; 287 RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); 288 break; 289 } 290 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); 291 RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); 292 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); 293 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); 294 if (dctx->litBufferLocation == ZSTD_split) 295 { 296 ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE); 297 ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE); 298 } 299 else 300 { 301 ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); 302 } 303 dctx->litPtr = dctx->litBuffer; 304 dctx->litSize = litSize; 305 return lhSize+1; 306 } 307 default: 308 RETURN_ERROR(corruption_detected, "impossible"); 309 } 310 } 311 } 312 313 /* Default FSE distribution tables. 314 * These are pre-calculated FSE decoding tables using default distributions as defined in specification : 315 * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions 316 * They were generated programmatically with following method : 317 * - start from default distributions, present in /lib/common/zstd_internal.h 318 * - generate tables normally, using ZSTD_buildFSETable() 319 * - printout the content of tables 320 * - pretify output, report below, test with fuzzer to ensure it's correct */ 321 322 /* Default FSE distribution table for Literal Lengths */ 323 static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = { 324 { 1, 1, 1, LL_DEFAULTNORMLOG}, /* header : fastMode, tableLog */ 325 /* nextState, nbAddBits, nbBits, baseVal */ 326 { 0, 0, 4, 0}, { 16, 0, 4, 0}, 327 { 32, 0, 5, 1}, { 0, 0, 5, 3}, 328 { 0, 0, 5, 4}, { 0, 0, 5, 6}, 329 { 0, 0, 5, 7}, { 0, 0, 5, 9}, 330 { 0, 0, 5, 10}, { 0, 0, 5, 12}, 331 { 0, 0, 6, 14}, { 0, 1, 5, 16}, 332 { 0, 1, 5, 20}, { 0, 1, 5, 22}, 333 { 0, 2, 5, 28}, { 0, 3, 5, 32}, 334 { 0, 4, 5, 48}, { 32, 6, 5, 64}, 335 { 0, 7, 5, 128}, { 0, 8, 6, 256}, 336 { 0, 10, 6, 1024}, { 0, 12, 6, 4096}, 337 { 32, 0, 4, 0}, { 0, 0, 4, 1}, 338 { 0, 0, 5, 2}, { 32, 0, 5, 4}, 339 { 0, 0, 5, 5}, { 32, 0, 5, 7}, 340 { 0, 0, 5, 8}, { 32, 0, 5, 10}, 341 { 0, 0, 5, 11}, { 0, 0, 6, 13}, 342 { 32, 1, 5, 16}, { 0, 1, 5, 18}, 343 { 32, 1, 5, 22}, { 0, 2, 5, 24}, 344 { 32, 3, 5, 32}, { 0, 3, 5, 40}, 345 { 0, 6, 4, 64}, { 16, 6, 4, 64}, 346 { 32, 7, 5, 128}, { 0, 9, 6, 512}, 347 { 0, 11, 6, 2048}, { 48, 0, 4, 0}, 348 { 16, 0, 4, 1}, { 32, 0, 5, 2}, 349 { 32, 0, 5, 3}, { 32, 0, 5, 5}, 350 { 32, 0, 5, 6}, { 32, 0, 5, 8}, 351 { 32, 0, 5, 9}, { 32, 0, 5, 11}, 352 { 32, 0, 5, 12}, { 0, 0, 6, 15}, 353 { 32, 1, 5, 18}, { 32, 1, 5, 20}, 354 { 32, 2, 5, 24}, { 32, 2, 5, 28}, 355 { 32, 3, 5, 40}, { 32, 4, 5, 48}, 356 { 0, 16, 6,65536}, { 0, 15, 6,32768}, 357 { 0, 14, 6,16384}, { 0, 13, 6, 8192}, 358 }; /* LL_defaultDTable */ 359 360 /* Default FSE distribution table for Offset Codes */ 361 static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = { 362 { 1, 1, 1, OF_DEFAULTNORMLOG}, /* header : fastMode, tableLog */ 363 /* nextState, nbAddBits, nbBits, baseVal */ 364 { 0, 0, 5, 0}, { 0, 6, 4, 61}, 365 { 0, 9, 5, 509}, { 0, 15, 5,32765}, 366 { 0, 21, 5,2097149}, { 0, 3, 5, 5}, 367 { 0, 7, 4, 125}, { 0, 12, 5, 4093}, 368 { 0, 18, 5,262141}, { 0, 23, 5,8388605}, 369 { 0, 5, 5, 29}, { 0, 8, 4, 253}, 370 { 0, 14, 5,16381}, { 0, 20, 5,1048573}, 371 { 0, 2, 5, 1}, { 16, 7, 4, 125}, 372 { 0, 11, 5, 2045}, { 0, 17, 5,131069}, 373 { 0, 22, 5,4194301}, { 0, 4, 5, 13}, 374 { 16, 8, 4, 253}, { 0, 13, 5, 8189}, 375 { 0, 19, 5,524285}, { 0, 1, 5, 1}, 376 { 16, 6, 4, 61}, { 0, 10, 5, 1021}, 377 { 0, 16, 5,65533}, { 0, 28, 5,268435453}, 378 { 0, 27, 5,134217725}, { 0, 26, 5,67108861}, 379 { 0, 25, 5,33554429}, { 0, 24, 5,16777213}, 380 }; /* OF_defaultDTable */ 381 382 383 /* Default FSE distribution table for Match Lengths */ 384 static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = { 385 { 1, 1, 1, ML_DEFAULTNORMLOG}, /* header : fastMode, tableLog */ 386 /* nextState, nbAddBits, nbBits, baseVal */ 387 { 0, 0, 6, 3}, { 0, 0, 4, 4}, 388 { 32, 0, 5, 5}, { 0, 0, 5, 6}, 389 { 0, 0, 5, 8}, { 0, 0, 5, 9}, 390 { 0, 0, 5, 11}, { 0, 0, 6, 13}, 391 { 0, 0, 6, 16}, { 0, 0, 6, 19}, 392 { 0, 0, 6, 22}, { 0, 0, 6, 25}, 393 { 0, 0, 6, 28}, { 0, 0, 6, 31}, 394 { 0, 0, 6, 34}, { 0, 1, 6, 37}, 395 { 0, 1, 6, 41}, { 0, 2, 6, 47}, 396 { 0, 3, 6, 59}, { 0, 4, 6, 83}, 397 { 0, 7, 6, 131}, { 0, 9, 6, 515}, 398 { 16, 0, 4, 4}, { 0, 0, 4, 5}, 399 { 32, 0, 5, 6}, { 0, 0, 5, 7}, 400 { 32, 0, 5, 9}, { 0, 0, 5, 10}, 401 { 0, 0, 6, 12}, { 0, 0, 6, 15}, 402 { 0, 0, 6, 18}, { 0, 0, 6, 21}, 403 { 0, 0, 6, 24}, { 0, 0, 6, 27}, 404 { 0, 0, 6, 30}, { 0, 0, 6, 33}, 405 { 0, 1, 6, 35}, { 0, 1, 6, 39}, 406 { 0, 2, 6, 43}, { 0, 3, 6, 51}, 407 { 0, 4, 6, 67}, { 0, 5, 6, 99}, 408 { 0, 8, 6, 259}, { 32, 0, 4, 4}, 409 { 48, 0, 4, 4}, { 16, 0, 4, 5}, 410 { 32, 0, 5, 7}, { 32, 0, 5, 8}, 411 { 32, 0, 5, 10}, { 32, 0, 5, 11}, 412 { 0, 0, 6, 14}, { 0, 0, 6, 17}, 413 { 0, 0, 6, 20}, { 0, 0, 6, 23}, 414 { 0, 0, 6, 26}, { 0, 0, 6, 29}, 415 { 0, 0, 6, 32}, { 0, 16, 6,65539}, 416 { 0, 15, 6,32771}, { 0, 14, 6,16387}, 417 { 0, 13, 6, 8195}, { 0, 12, 6, 4099}, 418 { 0, 11, 6, 2051}, { 0, 10, 6, 1027}, 419 }; /* ML_defaultDTable */ 420 421 422 static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits) 423 { 424 void* ptr = dt; 425 ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr; 426 ZSTD_seqSymbol* const cell = dt + 1; 427 428 DTableH->tableLog = 0; 429 DTableH->fastMode = 0; 430 431 cell->nbBits = 0; 432 cell->nextState = 0; 433 assert(nbAddBits < 255); 434 cell->nbAdditionalBits = nbAddBits; 435 cell->baseValue = baseValue; 436 } 437 438 439 /* ZSTD_buildFSETable() : 440 * generate FSE decoding table for one symbol (ll, ml or off) 441 * cannot fail if input is valid => 442 * all inputs are presumed validated at this stage */ 443 FORCE_INLINE_TEMPLATE 444 void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, 445 const short* normalizedCounter, unsigned maxSymbolValue, 446 const U32* baseValue, const U8* nbAdditionalBits, 447 unsigned tableLog, void* wksp, size_t wkspSize) 448 { 449 ZSTD_seqSymbol* const tableDecode = dt+1; 450 U32 const maxSV1 = maxSymbolValue + 1; 451 U32 const tableSize = 1 << tableLog; 452 453 U16* symbolNext = (U16*)wksp; 454 BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1); 455 U32 highThreshold = tableSize - 1; 456 457 458 /* Sanity Checks */ 459 assert(maxSymbolValue <= MaxSeq); 460 assert(tableLog <= MaxFSELog); 461 assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE); 462 (void)wkspSize; 463 /* Init, lay down lowprob symbols */ 464 { ZSTD_seqSymbol_header DTableH; 465 DTableH.tableLog = tableLog; 466 DTableH.fastMode = 1; 467 { S16 const largeLimit= (S16)(1 << (tableLog-1)); 468 U32 s; 469 for (s=0; s<maxSV1; s++) { 470 if (normalizedCounter[s]==-1) { 471 tableDecode[highThreshold--].baseValue = s; 472 symbolNext[s] = 1; 473 } else { 474 if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; 475 assert(normalizedCounter[s]>=0); 476 symbolNext[s] = (U16)normalizedCounter[s]; 477 } } } 478 ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); 479 } 480 481 /* Spread symbols */ 482 assert(tableSize <= 512); 483 /* Specialized symbol spreading for the case when there are 484 * no low probability (-1 count) symbols. When compressing 485 * small blocks we avoid low probability symbols to hit this 486 * case, since header decoding speed matters more. 487 */ 488 if (highThreshold == tableSize - 1) { 489 size_t const tableMask = tableSize-1; 490 size_t const step = FSE_TABLESTEP(tableSize); 491 /* First lay down the symbols in order. 492 * We use a uint64_t to lay down 8 bytes at a time. This reduces branch 493 * misses since small blocks generally have small table logs, so nearly 494 * all symbols have counts <= 8. We ensure we have 8 bytes at the end of 495 * our buffer to handle the over-write. 496 */ 497 { 498 U64 const add = 0x0101010101010101ull; 499 size_t pos = 0; 500 U64 sv = 0; 501 U32 s; 502 for (s=0; s<maxSV1; ++s, sv += add) { 503 int i; 504 int const n = normalizedCounter[s]; 505 MEM_write64(spread + pos, sv); 506 for (i = 8; i < n; i += 8) { 507 MEM_write64(spread + pos + i, sv); 508 } 509 pos += n; 510 } 511 } 512 /* Now we spread those positions across the table. 513 * The benefit of doing it in two stages is that we avoid the the 514 * variable size inner loop, which caused lots of branch misses. 515 * Now we can run through all the positions without any branch misses. 516 * We unroll the loop twice, since that is what emperically worked best. 517 */ 518 { 519 size_t position = 0; 520 size_t s; 521 size_t const unroll = 2; 522 assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ 523 for (s = 0; s < (size_t)tableSize; s += unroll) { 524 size_t u; 525 for (u = 0; u < unroll; ++u) { 526 size_t const uPosition = (position + (u * step)) & tableMask; 527 tableDecode[uPosition].baseValue = spread[s + u]; 528 } 529 position = (position + (unroll * step)) & tableMask; 530 } 531 assert(position == 0); 532 } 533 } else { 534 U32 const tableMask = tableSize-1; 535 U32 const step = FSE_TABLESTEP(tableSize); 536 U32 s, position = 0; 537 for (s=0; s<maxSV1; s++) { 538 int i; 539 int const n = normalizedCounter[s]; 540 for (i=0; i<n; i++) { 541 tableDecode[position].baseValue = s; 542 position = (position + step) & tableMask; 543 while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */ 544 } } 545 assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ 546 } 547 548 /* Build Decoding table */ 549 { 550 U32 u; 551 for (u=0; u<tableSize; u++) { 552 U32 const symbol = tableDecode[u].baseValue; 553 U32 const nextState = symbolNext[symbol]++; 554 tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) ); 555 tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize); 556 assert(nbAdditionalBits[symbol] < 255); 557 tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol]; 558 tableDecode[u].baseValue = baseValue[symbol]; 559 } 560 } 561 } 562 563 /* Avoids the FORCE_INLINE of the _body() function. */ 564 static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, 565 const short* normalizedCounter, unsigned maxSymbolValue, 566 const U32* baseValue, const U8* nbAdditionalBits, 567 unsigned tableLog, void* wksp, size_t wkspSize) 568 { 569 ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, 570 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 571 } 572 573 #if DYNAMIC_BMI2 574 BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, 575 const short* normalizedCounter, unsigned maxSymbolValue, 576 const U32* baseValue, const U8* nbAdditionalBits, 577 unsigned tableLog, void* wksp, size_t wkspSize) 578 { 579 ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, 580 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 581 } 582 #endif 583 584 void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, 585 const short* normalizedCounter, unsigned maxSymbolValue, 586 const U32* baseValue, const U8* nbAdditionalBits, 587 unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) 588 { 589 #if DYNAMIC_BMI2 590 if (bmi2) { 591 ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue, 592 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 593 return; 594 } 595 #endif 596 (void)bmi2; 597 ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue, 598 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 599 } 600 601 602 /*! ZSTD_buildSeqTable() : 603 * @return : nb bytes read from src, 604 * or an error code if it fails */ 605 static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr, 606 symbolEncodingType_e type, unsigned max, U32 maxLog, 607 const void* src, size_t srcSize, 608 const U32* baseValue, const U8* nbAdditionalBits, 609 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, 610 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize, 611 int bmi2) 612 { 613 switch(type) 614 { 615 case set_rle : 616 RETURN_ERROR_IF(!srcSize, srcSize_wrong, ""); 617 RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, ""); 618 { U32 const symbol = *(const BYTE*)src; 619 U32 const baseline = baseValue[symbol]; 620 U8 const nbBits = nbAdditionalBits[symbol]; 621 ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); 622 } 623 *DTablePtr = DTableSpace; 624 return 1; 625 case set_basic : 626 *DTablePtr = defaultTable; 627 return 0; 628 case set_repeat: 629 RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, ""); 630 /* prefetch FSE table if used */ 631 if (ddictIsCold && (nbSeq > 24 /* heuristic */)) { 632 const void* const pStart = *DTablePtr; 633 size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog)); 634 PREFETCH_AREA(pStart, pSize); 635 } 636 return 0; 637 case set_compressed : 638 { unsigned tableLog; 639 S16 norm[MaxSeq+1]; 640 size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); 641 RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, ""); 642 RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, ""); 643 ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2); 644 *DTablePtr = DTableSpace; 645 return headerSize; 646 } 647 default : 648 assert(0); 649 RETURN_ERROR(GENERIC, "impossible"); 650 } 651 } 652 653 size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, 654 const void* src, size_t srcSize) 655 { 656 const BYTE* const istart = (const BYTE*)src; 657 const BYTE* const iend = istart + srcSize; 658 const BYTE* ip = istart; 659 int nbSeq; 660 DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); 661 662 /* check */ 663 RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, ""); 664 665 /* SeqHead */ 666 nbSeq = *ip++; 667 if (!nbSeq) { 668 *nbSeqPtr=0; 669 RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, ""); 670 return 1; 671 } 672 if (nbSeq > 0x7F) { 673 if (nbSeq == 0xFF) { 674 RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); 675 nbSeq = MEM_readLE16(ip) + LONGNBSEQ; 676 ip+=2; 677 } else { 678 RETURN_ERROR_IF(ip >= iend, srcSize_wrong, ""); 679 nbSeq = ((nbSeq-0x80)<<8) + *ip++; 680 } 681 } 682 *nbSeqPtr = nbSeq; 683 684 /* FSE table descriptors */ 685 RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ 686 { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); 687 symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); 688 symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); 689 ip++; 690 691 /* Build DTables */ 692 { size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, 693 LLtype, MaxLL, LLFSELog, 694 ip, iend-ip, 695 LL_base, LL_bits, 696 LL_defaultDTable, dctx->fseEntropy, 697 dctx->ddictIsCold, nbSeq, 698 dctx->workspace, sizeof(dctx->workspace), 699 ZSTD_DCtx_get_bmi2(dctx)); 700 RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); 701 ip += llhSize; 702 } 703 704 { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, 705 OFtype, MaxOff, OffFSELog, 706 ip, iend-ip, 707 OF_base, OF_bits, 708 OF_defaultDTable, dctx->fseEntropy, 709 dctx->ddictIsCold, nbSeq, 710 dctx->workspace, sizeof(dctx->workspace), 711 ZSTD_DCtx_get_bmi2(dctx)); 712 RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); 713 ip += ofhSize; 714 } 715 716 { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, 717 MLtype, MaxML, MLFSELog, 718 ip, iend-ip, 719 ML_base, ML_bits, 720 ML_defaultDTable, dctx->fseEntropy, 721 dctx->ddictIsCold, nbSeq, 722 dctx->workspace, sizeof(dctx->workspace), 723 ZSTD_DCtx_get_bmi2(dctx)); 724 RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); 725 ip += mlhSize; 726 } 727 } 728 729 return ip-istart; 730 } 731 732 733 typedef struct { 734 size_t litLength; 735 size_t matchLength; 736 size_t offset; 737 } seq_t; 738 739 typedef struct { 740 size_t state; 741 const ZSTD_seqSymbol* table; 742 } ZSTD_fseState; 743 744 typedef struct { 745 BIT_DStream_t DStream; 746 ZSTD_fseState stateLL; 747 ZSTD_fseState stateOffb; 748 ZSTD_fseState stateML; 749 size_t prevOffset[ZSTD_REP_NUM]; 750 } seqState_t; 751 752 /*! ZSTD_overlapCopy8() : 753 * Copies 8 bytes from ip to op and updates op and ip where ip <= op. 754 * If the offset is < 8 then the offset is spread to at least 8 bytes. 755 * 756 * Precondition: *ip <= *op 757 * Postcondition: *op - *op >= 8 758 */ 759 HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) { 760 assert(*ip <= *op); 761 if (offset < 8) { 762 /* close range match, overlap */ 763 static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ 764 static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ 765 int const sub2 = dec64table[offset]; 766 (*op)[0] = (*ip)[0]; 767 (*op)[1] = (*ip)[1]; 768 (*op)[2] = (*ip)[2]; 769 (*op)[3] = (*ip)[3]; 770 *ip += dec32table[offset]; 771 ZSTD_copy4(*op+4, *ip); 772 *ip -= sub2; 773 } else { 774 ZSTD_copy8(*op, *ip); 775 } 776 *ip += 8; 777 *op += 8; 778 assert(*op - *ip >= 8); 779 } 780 781 /*! ZSTD_safecopy() : 782 * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer 783 * and write up to 16 bytes past oend_w (op >= oend_w is allowed). 784 * This function is only called in the uncommon case where the sequence is near the end of the block. It 785 * should be fast for a single long sequence, but can be slow for several short sequences. 786 * 787 * @param ovtype controls the overlap detection 788 * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. 789 * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. 790 * The src buffer must be before the dst buffer. 791 */ 792 static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { 793 ptrdiff_t const diff = op - ip; 794 BYTE* const oend = op + length; 795 796 assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) || 797 (ovtype == ZSTD_overlap_src_before_dst && diff >= 0)); 798 799 if (length < 8) { 800 /* Handle short lengths. */ 801 while (op < oend) *op++ = *ip++; 802 return; 803 } 804 if (ovtype == ZSTD_overlap_src_before_dst) { 805 /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ 806 assert(length >= 8); 807 ZSTD_overlapCopy8(&op, &ip, diff); 808 length -= 8; 809 assert(op - ip >= 8); 810 assert(op <= oend); 811 } 812 813 if (oend <= oend_w) { 814 /* No risk of overwrite. */ 815 ZSTD_wildcopy(op, ip, length, ovtype); 816 return; 817 } 818 if (op <= oend_w) { 819 /* Wildcopy until we get close to the end. */ 820 assert(oend > oend_w); 821 ZSTD_wildcopy(op, ip, oend_w - op, ovtype); 822 ip += oend_w - op; 823 op += oend_w - op; 824 } 825 /* Handle the leftovers. */ 826 while (op < oend) *op++ = *ip++; 827 } 828 829 /* ZSTD_safecopyDstBeforeSrc(): 830 * This version allows overlap with dst before src, or handles the non-overlap case with dst after src 831 * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ 832 static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { 833 ptrdiff_t const diff = op - ip; 834 BYTE* const oend = op + length; 835 836 if (length < 8 || diff > -8) { 837 /* Handle short lengths, close overlaps, and dst not before src. */ 838 while (op < oend) *op++ = *ip++; 839 return; 840 } 841 842 if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { 843 ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap); 844 ip += oend - WILDCOPY_OVERLENGTH - op; 845 op += oend - WILDCOPY_OVERLENGTH - op; 846 } 847 848 /* Handle the leftovers. */ 849 while (op < oend) *op++ = *ip++; 850 } 851 852 /* ZSTD_execSequenceEnd(): 853 * This version handles cases that are near the end of the output buffer. It requires 854 * more careful checks to make sure there is no overflow. By separating out these hard 855 * and unlikely cases, we can speed up the common cases. 856 * 857 * NOTE: This function needs to be fast for a single long sequence, but doesn't need 858 * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). 859 */ 860 FORCE_NOINLINE 861 size_t ZSTD_execSequenceEnd(BYTE* op, 862 BYTE* const oend, seq_t sequence, 863 const BYTE** litPtr, const BYTE* const litLimit, 864 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 865 { 866 BYTE* const oLitEnd = op + sequence.litLength; 867 size_t const sequenceLength = sequence.litLength + sequence.matchLength; 868 const BYTE* const iLitEnd = *litPtr + sequence.litLength; 869 const BYTE* match = oLitEnd - sequence.offset; 870 BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; 871 872 /* bounds checks : careful of address space overflow in 32-bit mode */ 873 RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); 874 RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); 875 assert(op < op + sequenceLength); 876 assert(oLitEnd < op + sequenceLength); 877 878 /* copy literals */ 879 ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap); 880 op = oLitEnd; 881 *litPtr = iLitEnd; 882 883 /* copy Match */ 884 if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 885 /* offset beyond prefix */ 886 RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); 887 match = dictEnd - (prefixStart - match); 888 if (match + sequence.matchLength <= dictEnd) { 889 ZSTD_memmove(oLitEnd, match, sequence.matchLength); 890 return sequenceLength; 891 } 892 /* span extDict & currentPrefixSegment */ 893 { size_t const length1 = dictEnd - match; 894 ZSTD_memmove(oLitEnd, match, length1); 895 op = oLitEnd + length1; 896 sequence.matchLength -= length1; 897 match = prefixStart; 898 } 899 } 900 ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); 901 return sequenceLength; 902 } 903 904 /* ZSTD_execSequenceEndSplitLitBuffer(): 905 * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 906 */ 907 FORCE_NOINLINE 908 size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, 909 BYTE* const oend, const BYTE* const oend_w, seq_t sequence, 910 const BYTE** litPtr, const BYTE* const litLimit, 911 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 912 { 913 BYTE* const oLitEnd = op + sequence.litLength; 914 size_t const sequenceLength = sequence.litLength + sequence.matchLength; 915 const BYTE* const iLitEnd = *litPtr + sequence.litLength; 916 const BYTE* match = oLitEnd - sequence.offset; 917 918 919 /* bounds checks : careful of address space overflow in 32-bit mode */ 920 RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); 921 RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); 922 assert(op < op + sequenceLength); 923 assert(oLitEnd < op + sequenceLength); 924 925 /* copy literals */ 926 RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); 927 ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); 928 op = oLitEnd; 929 *litPtr = iLitEnd; 930 931 /* copy Match */ 932 if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 933 /* offset beyond prefix */ 934 RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); 935 match = dictEnd - (prefixStart - match); 936 if (match + sequence.matchLength <= dictEnd) { 937 ZSTD_memmove(oLitEnd, match, sequence.matchLength); 938 return sequenceLength; 939 } 940 /* span extDict & currentPrefixSegment */ 941 { size_t const length1 = dictEnd - match; 942 ZSTD_memmove(oLitEnd, match, length1); 943 op = oLitEnd + length1; 944 sequence.matchLength -= length1; 945 match = prefixStart; 946 } 947 } 948 ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); 949 return sequenceLength; 950 } 951 952 HINT_INLINE 953 size_t ZSTD_execSequence(BYTE* op, 954 BYTE* const oend, seq_t sequence, 955 const BYTE** litPtr, const BYTE* const litLimit, 956 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 957 { 958 BYTE* const oLitEnd = op + sequence.litLength; 959 size_t const sequenceLength = sequence.litLength + sequence.matchLength; 960 BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ 961 BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ 962 const BYTE* const iLitEnd = *litPtr + sequence.litLength; 963 const BYTE* match = oLitEnd - sequence.offset; 964 965 assert(op != NULL /* Precondition */); 966 assert(oend_w < oend /* No underflow */); 967 /* Handle edge cases in a slow path: 968 * - Read beyond end of literals 969 * - Match end is within WILDCOPY_OVERLIMIT of oend 970 * - 32-bit mode and the match length overflows 971 */ 972 if (UNLIKELY( 973 iLitEnd > litLimit || 974 oMatchEnd > oend_w || 975 (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) 976 return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 977 978 /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ 979 assert(op <= oLitEnd /* No overflow */); 980 assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); 981 assert(oMatchEnd <= oend /* No underflow */); 982 assert(iLitEnd <= litLimit /* Literal length is in bounds */); 983 assert(oLitEnd <= oend_w /* Can wildcopy literals */); 984 assert(oMatchEnd <= oend_w /* Can wildcopy matches */); 985 986 /* Copy Literals: 987 * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. 988 * We likely don't need the full 32-byte wildcopy. 989 */ 990 assert(WILDCOPY_OVERLENGTH >= 16); 991 ZSTD_copy16(op, (*litPtr)); 992 if (UNLIKELY(sequence.litLength > 16)) { 993 ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); 994 } 995 op = oLitEnd; 996 *litPtr = iLitEnd; /* update for next sequence */ 997 998 /* Copy Match */ 999 if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 1000 /* offset beyond prefix -> go into extDict */ 1001 RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); 1002 match = dictEnd + (match - prefixStart); 1003 if (match + sequence.matchLength <= dictEnd) { 1004 ZSTD_memmove(oLitEnd, match, sequence.matchLength); 1005 return sequenceLength; 1006 } 1007 /* span extDict & currentPrefixSegment */ 1008 { size_t const length1 = dictEnd - match; 1009 ZSTD_memmove(oLitEnd, match, length1); 1010 op = oLitEnd + length1; 1011 sequence.matchLength -= length1; 1012 match = prefixStart; 1013 } 1014 } 1015 /* Match within prefix of 1 or more bytes */ 1016 assert(op <= oMatchEnd); 1017 assert(oMatchEnd <= oend_w); 1018 assert(match >= prefixStart); 1019 assert(sequence.matchLength >= 1); 1020 1021 /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy 1022 * without overlap checking. 1023 */ 1024 if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { 1025 /* We bet on a full wildcopy for matches, since we expect matches to be 1026 * longer than literals (in general). In silesia, ~10% of matches are longer 1027 * than 16 bytes. 1028 */ 1029 ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); 1030 return sequenceLength; 1031 } 1032 assert(sequence.offset < WILDCOPY_VECLEN); 1033 1034 /* Copy 8 bytes and spread the offset to be >= 8. */ 1035 ZSTD_overlapCopy8(&op, &match, sequence.offset); 1036 1037 /* If the match length is > 8 bytes, then continue with the wildcopy. */ 1038 if (sequence.matchLength > 8) { 1039 assert(op < oMatchEnd); 1040 ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); 1041 } 1042 return sequenceLength; 1043 } 1044 1045 HINT_INLINE 1046 size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, 1047 BYTE* const oend, const BYTE* const oend_w, seq_t sequence, 1048 const BYTE** litPtr, const BYTE* const litLimit, 1049 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 1050 { 1051 BYTE* const oLitEnd = op + sequence.litLength; 1052 size_t const sequenceLength = sequence.litLength + sequence.matchLength; 1053 BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ 1054 const BYTE* const iLitEnd = *litPtr + sequence.litLength; 1055 const BYTE* match = oLitEnd - sequence.offset; 1056 1057 assert(op != NULL /* Precondition */); 1058 assert(oend_w < oend /* No underflow */); 1059 /* Handle edge cases in a slow path: 1060 * - Read beyond end of literals 1061 * - Match end is within WILDCOPY_OVERLIMIT of oend 1062 * - 32-bit mode and the match length overflows 1063 */ 1064 if (UNLIKELY( 1065 iLitEnd > litLimit || 1066 oMatchEnd > oend_w || 1067 (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) 1068 return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 1069 1070 /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ 1071 assert(op <= oLitEnd /* No overflow */); 1072 assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); 1073 assert(oMatchEnd <= oend /* No underflow */); 1074 assert(iLitEnd <= litLimit /* Literal length is in bounds */); 1075 assert(oLitEnd <= oend_w /* Can wildcopy literals */); 1076 assert(oMatchEnd <= oend_w /* Can wildcopy matches */); 1077 1078 /* Copy Literals: 1079 * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. 1080 * We likely don't need the full 32-byte wildcopy. 1081 */ 1082 assert(WILDCOPY_OVERLENGTH >= 16); 1083 ZSTD_copy16(op, (*litPtr)); 1084 if (UNLIKELY(sequence.litLength > 16)) { 1085 ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap); 1086 } 1087 op = oLitEnd; 1088 *litPtr = iLitEnd; /* update for next sequence */ 1089 1090 /* Copy Match */ 1091 if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 1092 /* offset beyond prefix -> go into extDict */ 1093 RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); 1094 match = dictEnd + (match - prefixStart); 1095 if (match + sequence.matchLength <= dictEnd) { 1096 ZSTD_memmove(oLitEnd, match, sequence.matchLength); 1097 return sequenceLength; 1098 } 1099 /* span extDict & currentPrefixSegment */ 1100 { size_t const length1 = dictEnd - match; 1101 ZSTD_memmove(oLitEnd, match, length1); 1102 op = oLitEnd + length1; 1103 sequence.matchLength -= length1; 1104 match = prefixStart; 1105 } } 1106 /* Match within prefix of 1 or more bytes */ 1107 assert(op <= oMatchEnd); 1108 assert(oMatchEnd <= oend_w); 1109 assert(match >= prefixStart); 1110 assert(sequence.matchLength >= 1); 1111 1112 /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy 1113 * without overlap checking. 1114 */ 1115 if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { 1116 /* We bet on a full wildcopy for matches, since we expect matches to be 1117 * longer than literals (in general). In silesia, ~10% of matches are longer 1118 * than 16 bytes. 1119 */ 1120 ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); 1121 return sequenceLength; 1122 } 1123 assert(sequence.offset < WILDCOPY_VECLEN); 1124 1125 /* Copy 8 bytes and spread the offset to be >= 8. */ 1126 ZSTD_overlapCopy8(&op, &match, sequence.offset); 1127 1128 /* If the match length is > 8 bytes, then continue with the wildcopy. */ 1129 if (sequence.matchLength > 8) { 1130 assert(op < oMatchEnd); 1131 ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); 1132 } 1133 return sequenceLength; 1134 } 1135 1136 1137 static void 1138 ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) 1139 { 1140 const void* ptr = dt; 1141 const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr; 1142 DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); 1143 DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", 1144 (U32)DStatePtr->state, DTableH->tableLog); 1145 BIT_reloadDStream(bitD); 1146 DStatePtr->table = dt + 1; 1147 } 1148 1149 FORCE_INLINE_TEMPLATE void 1150 ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) 1151 { 1152 size_t const lowBits = BIT_readBits(bitD, nbBits); 1153 DStatePtr->state = nextState + lowBits; 1154 } 1155 1156 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum 1157 * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) 1158 * bits before reloading. This value is the maximum number of bytes we read 1159 * after reloading when we are decoding long offsets. 1160 */ 1161 #define LONG_OFFSETS_MAX_EXTRA_BITS_32 \ 1162 (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \ 1163 ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \ 1164 : 0) 1165 1166 typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; 1167 1168 FORCE_INLINE_TEMPLATE seq_t 1169 ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) 1170 { 1171 seq_t seq; 1172 const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; 1173 const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; 1174 const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; 1175 seq.matchLength = mlDInfo->baseValue; 1176 seq.litLength = llDInfo->baseValue; 1177 { U32 const ofBase = ofDInfo->baseValue; 1178 BYTE const llBits = llDInfo->nbAdditionalBits; 1179 BYTE const mlBits = mlDInfo->nbAdditionalBits; 1180 BYTE const ofBits = ofDInfo->nbAdditionalBits; 1181 BYTE const totalBits = llBits+mlBits+ofBits; 1182 1183 U16 const llNext = llDInfo->nextState; 1184 U16 const mlNext = mlDInfo->nextState; 1185 U16 const ofNext = ofDInfo->nextState; 1186 U32 const llnbBits = llDInfo->nbBits; 1187 U32 const mlnbBits = mlDInfo->nbBits; 1188 U32 const ofnbBits = ofDInfo->nbBits; 1189 /* 1190 * As gcc has better branch and block analyzers, sometimes it is only 1191 * valuable to mark likelyness for clang, it gives around 3-4% of 1192 * performance. 1193 */ 1194 1195 /* sequence */ 1196 { size_t offset; 1197 #if defined(__clang__) 1198 if (LIKELY(ofBits > 1)) { 1199 #else 1200 if (ofBits > 1) { 1201 #endif 1202 ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); 1203 ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); 1204 assert(ofBits <= MaxOff); 1205 if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { 1206 U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); 1207 offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); 1208 BIT_reloadDStream(&seqState->DStream); 1209 if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); 1210 assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ 1211 } else { 1212 offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ 1213 if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); 1214 } 1215 seqState->prevOffset[2] = seqState->prevOffset[1]; 1216 seqState->prevOffset[1] = seqState->prevOffset[0]; 1217 seqState->prevOffset[0] = offset; 1218 } else { 1219 U32 const ll0 = (llDInfo->baseValue == 0); 1220 if (LIKELY((ofBits == 0))) { 1221 offset = seqState->prevOffset[ll0]; 1222 seqState->prevOffset[1] = seqState->prevOffset[!ll0]; 1223 seqState->prevOffset[0] = offset; 1224 } else { 1225 offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); 1226 { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; 1227 temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ 1228 if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; 1229 seqState->prevOffset[1] = seqState->prevOffset[0]; 1230 seqState->prevOffset[0] = offset = temp; 1231 } } } 1232 seq.offset = offset; 1233 } 1234 1235 #if defined(__clang__) 1236 if (UNLIKELY(mlBits > 0)) 1237 #else 1238 if (mlBits > 0) 1239 #endif 1240 seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); 1241 1242 if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) 1243 BIT_reloadDStream(&seqState->DStream); 1244 if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) 1245 BIT_reloadDStream(&seqState->DStream); 1246 /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ 1247 ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); 1248 1249 #if defined(__clang__) 1250 if (UNLIKELY(llBits > 0)) 1251 #else 1252 if (llBits > 0) 1253 #endif 1254 seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); 1255 1256 if (MEM_32bits()) 1257 BIT_reloadDStream(&seqState->DStream); 1258 1259 DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", 1260 (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); 1261 1262 ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ 1263 ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ 1264 if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ 1265 ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ 1266 } 1267 1268 return seq; 1269 } 1270 1271 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 1272 MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) 1273 { 1274 size_t const windowSize = dctx->fParams.windowSize; 1275 /* No dictionary used. */ 1276 if (dctx->dictContentEndForFuzzing == NULL) return 0; 1277 /* Dictionary is our prefix. */ 1278 if (prefixStart == dctx->dictContentBeginForFuzzing) return 1; 1279 /* Dictionary is not our ext-dict. */ 1280 if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0; 1281 /* Dictionary is not within our window size. */ 1282 if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0; 1283 /* Dictionary is active. */ 1284 return 1; 1285 } 1286 1287 MEM_STATIC void ZSTD_assertValidSequence( 1288 ZSTD_DCtx const* dctx, 1289 BYTE const* op, BYTE const* oend, 1290 seq_t const seq, 1291 BYTE const* prefixStart, BYTE const* virtualStart) 1292 { 1293 #if DEBUGLEVEL >= 1 1294 size_t const windowSize = dctx->fParams.windowSize; 1295 size_t const sequenceSize = seq.litLength + seq.matchLength; 1296 BYTE const* const oLitEnd = op + seq.litLength; 1297 DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", 1298 (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); 1299 assert(op <= oend); 1300 assert((size_t)(oend - op) >= sequenceSize); 1301 assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); 1302 if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { 1303 size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); 1304 /* Offset must be within the dictionary. */ 1305 assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); 1306 assert(seq.offset <= windowSize + dictSize); 1307 } else { 1308 /* Offset must be within our window. */ 1309 assert(seq.offset <= windowSize); 1310 } 1311 #else 1312 (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; 1313 #endif 1314 } 1315 #endif 1316 1317 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1318 1319 1320 FORCE_INLINE_TEMPLATE size_t 1321 DONT_VECTORIZE 1322 ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, 1323 void* dst, size_t maxDstSize, 1324 const void* seqStart, size_t seqSize, int nbSeq, 1325 const ZSTD_longOffset_e isLongOffset, 1326 const int frame) 1327 { 1328 const BYTE* ip = (const BYTE*)seqStart; 1329 const BYTE* const iend = ip + seqSize; 1330 BYTE* const ostart = (BYTE*)dst; 1331 BYTE* const oend = ostart + maxDstSize; 1332 BYTE* op = ostart; 1333 const BYTE* litPtr = dctx->litPtr; 1334 const BYTE* litBufferEnd = dctx->litBufferEnd; 1335 const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); 1336 const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); 1337 const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); 1338 DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); 1339 (void)frame; 1340 1341 /* Regen sequences */ 1342 if (nbSeq) { 1343 seqState_t seqState; 1344 dctx->fseEntropy = 1; 1345 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1346 RETURN_ERROR_IF( 1347 ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), 1348 corruption_detected, ""); 1349 ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1350 ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1351 ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1352 assert(dst != NULL); 1353 1354 ZSTD_STATIC_ASSERT( 1355 BIT_DStream_unfinished < BIT_DStream_completed && 1356 BIT_DStream_endOfBuffer < BIT_DStream_completed && 1357 BIT_DStream_completed < BIT_DStream_overflow); 1358 1359 /* decompress without overrunning litPtr begins */ 1360 { 1361 seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1362 /* Align the decompression loop to 32 + 16 bytes. 1363 * 1364 * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression 1365 * speed swings based on the alignment of the decompression loop. This 1366 * performance swing is caused by parts of the decompression loop falling 1367 * out of the DSB. The entire decompression loop should fit in the DSB, 1368 * when it can't we get much worse performance. You can measure if you've 1369 * hit the good case or the bad case with this perf command for some 1370 * compressed file test.zst: 1371 * 1372 * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ 1373 * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst 1374 * 1375 * If you see most cycles served out of the MITE you've hit the bad case. 1376 * If you see most cycles served out of the DSB you've hit the good case. 1377 * If it is pretty even then you may be in an okay case. 1378 * 1379 * This issue has been reproduced on the following CPUs: 1380 * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 1381 * Use Instruments->Counters to get DSB/MITE cycles. 1382 * I never got performance swings, but I was able to 1383 * go from the good case of mostly DSB to half of the 1384 * cycles served from MITE. 1385 * - Coffeelake: Intel i9-9900k 1386 * - Coffeelake: Intel i7-9700k 1387 * 1388 * I haven't been able to reproduce the instability or DSB misses on any 1389 * of the following CPUS: 1390 * - Haswell 1391 * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH 1392 * - Skylake 1393 * 1394 * Alignment is done for each of the three major decompression loops: 1395 * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer 1396 * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer 1397 * - ZSTD_decompressSequences_body 1398 * Alignment choices are made to minimize large swings on bad cases and influence on performance 1399 * from changes external to this code, rather than to overoptimize on the current commit. 1400 * 1401 * If you are seeing performance stability this script can help test. 1402 * It tests on 4 commits in zstd where I saw performance change. 1403 * 1404 * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 1405 */ 1406 #if defined(__x86_64__) 1407 __asm__(".p2align 6"); 1408 # if __GNUC__ >= 7 1409 /* good for gcc-7, gcc-9, and gcc-11 */ 1410 __asm__("nop"); 1411 __asm__(".p2align 5"); 1412 __asm__("nop"); 1413 __asm__(".p2align 4"); 1414 # if __GNUC__ == 8 || __GNUC__ == 10 1415 /* good for gcc-8 and gcc-10 */ 1416 __asm__("nop"); 1417 __asm__(".p2align 3"); 1418 # endif 1419 # endif 1420 #endif 1421 1422 /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ 1423 for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { 1424 size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1425 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1426 assert(!ZSTD_isError(oneSeqSize)); 1427 if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1428 #endif 1429 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1430 return oneSeqSize; 1431 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1432 op += oneSeqSize; 1433 if (UNLIKELY(!--nbSeq)) 1434 break; 1435 BIT_reloadDStream(&(seqState.DStream)); 1436 sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1437 } 1438 1439 /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ 1440 if (nbSeq > 0) { 1441 const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1442 if (leftoverLit) 1443 { 1444 RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1445 ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1446 sequence.litLength -= leftoverLit; 1447 op += leftoverLit; 1448 } 1449 litPtr = dctx->litExtraBuffer; 1450 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1451 dctx->litBufferLocation = ZSTD_not_in_dst; 1452 { 1453 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1454 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1455 assert(!ZSTD_isError(oneSeqSize)); 1456 if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1457 #endif 1458 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1459 return oneSeqSize; 1460 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1461 op += oneSeqSize; 1462 if (--nbSeq) 1463 BIT_reloadDStream(&(seqState.DStream)); 1464 } 1465 } 1466 } 1467 1468 if (nbSeq > 0) /* there is remaining lit from extra buffer */ 1469 { 1470 1471 #if defined(__x86_64__) 1472 __asm__(".p2align 6"); 1473 __asm__("nop"); 1474 # if __GNUC__ != 7 1475 /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ 1476 __asm__(".p2align 4"); 1477 __asm__("nop"); 1478 __asm__(".p2align 3"); 1479 # elif __GNUC__ >= 11 1480 __asm__(".p2align 3"); 1481 # else 1482 __asm__(".p2align 5"); 1483 __asm__("nop"); 1484 __asm__(".p2align 3"); 1485 # endif 1486 #endif 1487 1488 for (; ; ) { 1489 seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1490 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1491 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1492 assert(!ZSTD_isError(oneSeqSize)); 1493 if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1494 #endif 1495 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1496 return oneSeqSize; 1497 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1498 op += oneSeqSize; 1499 if (UNLIKELY(!--nbSeq)) 1500 break; 1501 BIT_reloadDStream(&(seqState.DStream)); 1502 } 1503 } 1504 1505 /* check if reached exact end */ 1506 DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); 1507 RETURN_ERROR_IF(nbSeq, corruption_detected, ""); 1508 RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); 1509 /* save reps for next block */ 1510 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1511 } 1512 1513 /* last literal segment */ 1514 if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ 1515 { 1516 size_t const lastLLSize = litBufferEnd - litPtr; 1517 RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1518 if (op != NULL) { 1519 ZSTD_memmove(op, litPtr, lastLLSize); 1520 op += lastLLSize; 1521 } 1522 litPtr = dctx->litExtraBuffer; 1523 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1524 dctx->litBufferLocation = ZSTD_not_in_dst; 1525 } 1526 { size_t const lastLLSize = litBufferEnd - litPtr; 1527 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1528 if (op != NULL) { 1529 ZSTD_memcpy(op, litPtr, lastLLSize); 1530 op += lastLLSize; 1531 } 1532 } 1533 1534 return op-ostart; 1535 } 1536 1537 FORCE_INLINE_TEMPLATE size_t 1538 DONT_VECTORIZE 1539 ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, 1540 void* dst, size_t maxDstSize, 1541 const void* seqStart, size_t seqSize, int nbSeq, 1542 const ZSTD_longOffset_e isLongOffset, 1543 const int frame) 1544 { 1545 const BYTE* ip = (const BYTE*)seqStart; 1546 const BYTE* const iend = ip + seqSize; 1547 BYTE* const ostart = (BYTE*)dst; 1548 BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; 1549 BYTE* op = ostart; 1550 const BYTE* litPtr = dctx->litPtr; 1551 const BYTE* const litEnd = litPtr + dctx->litSize; 1552 const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); 1553 const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); 1554 const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); 1555 DEBUGLOG(5, "ZSTD_decompressSequences_body"); 1556 (void)frame; 1557 1558 /* Regen sequences */ 1559 if (nbSeq) { 1560 seqState_t seqState; 1561 dctx->fseEntropy = 1; 1562 { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1563 RETURN_ERROR_IF( 1564 ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), 1565 corruption_detected, ""); 1566 ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1567 ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1568 ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1569 assert(dst != NULL); 1570 1571 ZSTD_STATIC_ASSERT( 1572 BIT_DStream_unfinished < BIT_DStream_completed && 1573 BIT_DStream_endOfBuffer < BIT_DStream_completed && 1574 BIT_DStream_completed < BIT_DStream_overflow); 1575 1576 #if defined(__x86_64__) 1577 __asm__(".p2align 6"); 1578 __asm__("nop"); 1579 # if __GNUC__ >= 7 1580 __asm__(".p2align 5"); 1581 __asm__("nop"); 1582 __asm__(".p2align 3"); 1583 # else 1584 __asm__(".p2align 4"); 1585 __asm__("nop"); 1586 __asm__(".p2align 3"); 1587 # endif 1588 #endif 1589 1590 for ( ; ; ) { 1591 seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1592 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); 1593 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1594 assert(!ZSTD_isError(oneSeqSize)); 1595 if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1596 #endif 1597 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1598 return oneSeqSize; 1599 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1600 op += oneSeqSize; 1601 if (UNLIKELY(!--nbSeq)) 1602 break; 1603 BIT_reloadDStream(&(seqState.DStream)); 1604 } 1605 1606 /* check if reached exact end */ 1607 DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); 1608 RETURN_ERROR_IF(nbSeq, corruption_detected, ""); 1609 RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); 1610 /* save reps for next block */ 1611 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1612 } 1613 1614 /* last literal segment */ 1615 { size_t const lastLLSize = litEnd - litPtr; 1616 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1617 if (op != NULL) { 1618 ZSTD_memcpy(op, litPtr, lastLLSize); 1619 op += lastLLSize; 1620 } 1621 } 1622 1623 return op-ostart; 1624 } 1625 1626 static size_t 1627 ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, 1628 void* dst, size_t maxDstSize, 1629 const void* seqStart, size_t seqSize, int nbSeq, 1630 const ZSTD_longOffset_e isLongOffset, 1631 const int frame) 1632 { 1633 return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1634 } 1635 1636 static size_t 1637 ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, 1638 void* dst, size_t maxDstSize, 1639 const void* seqStart, size_t seqSize, int nbSeq, 1640 const ZSTD_longOffset_e isLongOffset, 1641 const int frame) 1642 { 1643 return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1644 } 1645 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1646 1647 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1648 1649 FORCE_INLINE_TEMPLATE size_t 1650 ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, 1651 const BYTE* const prefixStart, const BYTE* const dictEnd) 1652 { 1653 prefetchPos += sequence.litLength; 1654 { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; 1655 const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 1656 * No consequence though : memory address is only used for prefetching, not for dereferencing */ 1657 PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ 1658 } 1659 return prefetchPos + sequence.matchLength; 1660 } 1661 1662 /* This decoding function employs prefetching 1663 * to reduce latency impact of cache misses. 1664 * It's generally employed when block contains a significant portion of long-distance matches 1665 * or when coupled with a "cold" dictionary */ 1666 FORCE_INLINE_TEMPLATE size_t 1667 ZSTD_decompressSequencesLong_body( 1668 ZSTD_DCtx* dctx, 1669 void* dst, size_t maxDstSize, 1670 const void* seqStart, size_t seqSize, int nbSeq, 1671 const ZSTD_longOffset_e isLongOffset, 1672 const int frame) 1673 { 1674 const BYTE* ip = (const BYTE*)seqStart; 1675 const BYTE* const iend = ip + seqSize; 1676 BYTE* const ostart = (BYTE*)dst; 1677 BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; 1678 BYTE* op = ostart; 1679 const BYTE* litPtr = dctx->litPtr; 1680 const BYTE* litBufferEnd = dctx->litBufferEnd; 1681 const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); 1682 const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); 1683 const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); 1684 (void)frame; 1685 1686 /* Regen sequences */ 1687 if (nbSeq) { 1688 #define STORED_SEQS 8 1689 #define STORED_SEQS_MASK (STORED_SEQS-1) 1690 #define ADVANCED_SEQS STORED_SEQS 1691 seq_t sequences[STORED_SEQS]; 1692 int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); 1693 seqState_t seqState; 1694 int seqNb; 1695 size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ 1696 1697 dctx->fseEntropy = 1; 1698 { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1699 assert(dst != NULL); 1700 assert(iend >= ip); 1701 RETURN_ERROR_IF( 1702 ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), 1703 corruption_detected, ""); 1704 ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1705 ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1706 ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1707 1708 /* prepare in advance */ 1709 for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) { 1710 seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1711 prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1712 sequences[seqNb] = sequence; 1713 } 1714 RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, ""); 1715 1716 /* decompress without stomping litBuffer */ 1717 for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) { 1718 seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1719 size_t oneSeqSize; 1720 1721 if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) 1722 { 1723 /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ 1724 const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1725 if (leftoverLit) 1726 { 1727 RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1728 ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1729 sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; 1730 op += leftoverLit; 1731 } 1732 litPtr = dctx->litExtraBuffer; 1733 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1734 dctx->litBufferLocation = ZSTD_not_in_dst; 1735 oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1736 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1737 assert(!ZSTD_isError(oneSeqSize)); 1738 if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1739 #endif 1740 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1741 1742 prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1743 sequences[seqNb & STORED_SEQS_MASK] = sequence; 1744 op += oneSeqSize; 1745 } 1746 else 1747 { 1748 /* lit buffer is either wholly contained in first or second split, or not split at all*/ 1749 oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 1750 ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1751 ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1752 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1753 assert(!ZSTD_isError(oneSeqSize)); 1754 if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1755 #endif 1756 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1757 1758 prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1759 sequences[seqNb & STORED_SEQS_MASK] = sequence; 1760 op += oneSeqSize; 1761 } 1762 } 1763 RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, ""); 1764 1765 /* finish queue */ 1766 seqNb -= seqAdvance; 1767 for ( ; seqNb<nbSeq ; seqNb++) { 1768 seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]); 1769 if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) 1770 { 1771 const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1772 if (leftoverLit) 1773 { 1774 RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1775 ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1776 sequence->litLength -= leftoverLit; 1777 op += leftoverLit; 1778 } 1779 litPtr = dctx->litExtraBuffer; 1780 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1781 dctx->litBufferLocation = ZSTD_not_in_dst; 1782 { 1783 size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1784 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1785 assert(!ZSTD_isError(oneSeqSize)); 1786 if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1787 #endif 1788 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1789 op += oneSeqSize; 1790 } 1791 } 1792 else 1793 { 1794 size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 1795 ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1796 ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1797 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1798 assert(!ZSTD_isError(oneSeqSize)); 1799 if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1800 #endif 1801 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1802 op += oneSeqSize; 1803 } 1804 } 1805 1806 /* save reps for next block */ 1807 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1808 } 1809 1810 /* last literal segment */ 1811 if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ 1812 { 1813 size_t const lastLLSize = litBufferEnd - litPtr; 1814 RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1815 if (op != NULL) { 1816 ZSTD_memmove(op, litPtr, lastLLSize); 1817 op += lastLLSize; 1818 } 1819 litPtr = dctx->litExtraBuffer; 1820 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1821 } 1822 { size_t const lastLLSize = litBufferEnd - litPtr; 1823 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1824 if (op != NULL) { 1825 ZSTD_memmove(op, litPtr, lastLLSize); 1826 op += lastLLSize; 1827 } 1828 } 1829 1830 return op-ostart; 1831 } 1832 1833 static size_t 1834 ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, 1835 void* dst, size_t maxDstSize, 1836 const void* seqStart, size_t seqSize, int nbSeq, 1837 const ZSTD_longOffset_e isLongOffset, 1838 const int frame) 1839 { 1840 return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1841 } 1842 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1843 1844 1845 1846 #if DYNAMIC_BMI2 1847 1848 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1849 static BMI2_TARGET_ATTRIBUTE size_t 1850 DONT_VECTORIZE 1851 ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, 1852 void* dst, size_t maxDstSize, 1853 const void* seqStart, size_t seqSize, int nbSeq, 1854 const ZSTD_longOffset_e isLongOffset, 1855 const int frame) 1856 { 1857 return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1858 } 1859 static BMI2_TARGET_ATTRIBUTE size_t 1860 DONT_VECTORIZE 1861 ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, 1862 void* dst, size_t maxDstSize, 1863 const void* seqStart, size_t seqSize, int nbSeq, 1864 const ZSTD_longOffset_e isLongOffset, 1865 const int frame) 1866 { 1867 return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1868 } 1869 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1870 1871 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1872 static BMI2_TARGET_ATTRIBUTE size_t 1873 ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, 1874 void* dst, size_t maxDstSize, 1875 const void* seqStart, size_t seqSize, int nbSeq, 1876 const ZSTD_longOffset_e isLongOffset, 1877 const int frame) 1878 { 1879 return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1880 } 1881 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1882 1883 #endif /* DYNAMIC_BMI2 */ 1884 1885 typedef size_t (*ZSTD_decompressSequences_t)( 1886 ZSTD_DCtx* dctx, 1887 void* dst, size_t maxDstSize, 1888 const void* seqStart, size_t seqSize, int nbSeq, 1889 const ZSTD_longOffset_e isLongOffset, 1890 const int frame); 1891 1892 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1893 static size_t 1894 ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, 1895 const void* seqStart, size_t seqSize, int nbSeq, 1896 const ZSTD_longOffset_e isLongOffset, 1897 const int frame) 1898 { 1899 DEBUGLOG(5, "ZSTD_decompressSequences"); 1900 #if DYNAMIC_BMI2 1901 if (ZSTD_DCtx_get_bmi2(dctx)) { 1902 return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1903 } 1904 #endif 1905 return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1906 } 1907 static size_t 1908 ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, 1909 const void* seqStart, size_t seqSize, int nbSeq, 1910 const ZSTD_longOffset_e isLongOffset, 1911 const int frame) 1912 { 1913 DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); 1914 #if DYNAMIC_BMI2 1915 if (ZSTD_DCtx_get_bmi2(dctx)) { 1916 return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1917 } 1918 #endif 1919 return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1920 } 1921 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1922 1923 1924 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1925 /* ZSTD_decompressSequencesLong() : 1926 * decompression function triggered when a minimum share of offsets is considered "long", 1927 * aka out of cache. 1928 * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance". 1929 * This function will try to mitigate main memory latency through the use of prefetching */ 1930 static size_t 1931 ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, 1932 void* dst, size_t maxDstSize, 1933 const void* seqStart, size_t seqSize, int nbSeq, 1934 const ZSTD_longOffset_e isLongOffset, 1935 const int frame) 1936 { 1937 DEBUGLOG(5, "ZSTD_decompressSequencesLong"); 1938 #if DYNAMIC_BMI2 1939 if (ZSTD_DCtx_get_bmi2(dctx)) { 1940 return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1941 } 1942 #endif 1943 return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1944 } 1945 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1946 1947 1948 1949 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ 1950 !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) 1951 /* ZSTD_getLongOffsetsShare() : 1952 * condition : offTable must be valid 1953 * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) 1954 * compared to maximum possible of (1<<OffFSELog) */ 1955 static unsigned 1956 ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable) 1957 { 1958 const void* ptr = offTable; 1959 U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; 1960 const ZSTD_seqSymbol* table = offTable + 1; 1961 U32 const max = 1 << tableLog; 1962 U32 u, total = 0; 1963 DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); 1964 1965 assert(max <= (1 << OffFSELog)); /* max not too large */ 1966 for (u=0; u<max; u++) { 1967 if (table[u].nbAdditionalBits > 22) total += 1; 1968 } 1969 1970 assert(tableLog <= OffFSELog); 1971 total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ 1972 1973 return total; 1974 } 1975 #endif 1976 1977 size_t 1978 ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, 1979 void* dst, size_t dstCapacity, 1980 const void* src, size_t srcSize, const int frame, const streaming_operation streaming) 1981 { /* blockType == blockCompressed */ 1982 const BYTE* ip = (const BYTE*)src; 1983 /* isLongOffset must be true if there are long offsets. 1984 * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. 1985 * We don't expect that to be the case in 64-bit mode. 1986 * In block mode, window size is not known, so we have to be conservative. 1987 * (note: but it could be evaluated from current-lowLimit) 1988 */ 1989 ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); 1990 DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); 1991 1992 RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); 1993 1994 /* Decode literals section */ 1995 { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); 1996 DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); 1997 if (ZSTD_isError(litCSize)) return litCSize; 1998 ip += litCSize; 1999 srcSize -= litCSize; 2000 } 2001 2002 /* Build Decoding Tables */ 2003 { 2004 /* These macros control at build-time which decompressor implementation 2005 * we use. If neither is defined, we do some inspection and dispatch at 2006 * runtime. 2007 */ 2008 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ 2009 !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) 2010 int usePrefetchDecoder = dctx->ddictIsCold; 2011 #endif 2012 int nbSeq; 2013 size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); 2014 if (ZSTD_isError(seqHSize)) return seqHSize; 2015 ip += seqHSize; 2016 srcSize -= seqHSize; 2017 2018 RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); 2019 2020 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ 2021 !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) 2022 if ( !usePrefetchDecoder 2023 && (!frame || (dctx->fParams.windowSize > (1<<24))) 2024 && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ 2025 U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); 2026 U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ 2027 usePrefetchDecoder = (shareLongOffsets >= minShare); 2028 } 2029 #endif 2030 2031 dctx->ddictIsCold = 0; 2032 2033 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ 2034 !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) 2035 if (usePrefetchDecoder) 2036 #endif 2037 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 2038 return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); 2039 #endif 2040 2041 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 2042 /* else */ 2043 if (dctx->litBufferLocation == ZSTD_split) 2044 return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); 2045 else 2046 return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); 2047 #endif 2048 } 2049 } 2050 2051 2052 void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) 2053 { 2054 if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ 2055 dctx->dictEnd = dctx->previousDstEnd; 2056 dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); 2057 dctx->prefixStart = dst; 2058 dctx->previousDstEnd = dst; 2059 } 2060 } 2061 2062 2063 size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, 2064 void* dst, size_t dstCapacity, 2065 const void* src, size_t srcSize) 2066 { 2067 size_t dSize; 2068 ZSTD_checkContinuity(dctx, dst, dstCapacity); 2069 dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); 2070 dctx->previousDstEnd = (char*)dst + dSize; 2071 return dSize; 2072 } 2073