1 /* SPDX-License-Identifier: 0BSD */ 2 3 /** 4 * \file lzma/lzma12.h 5 * \brief LZMA1 and LZMA2 filters 6 * \note Never include this file directly. Use <lzma.h> instead. 7 */ 8 9 /* 10 * Author: Lasse Collin 11 */ 12 13 #ifndef LZMA_H_INTERNAL 14 # error Never include this file directly. Use <lzma.h> instead. 15 #endif 16 17 18 /** 19 * \brief LZMA1 Filter ID (for raw encoder/decoder only, not in .xz) 20 * 21 * LZMA1 is the very same thing as what was called just LZMA in LZMA Utils, 22 * 7-Zip, and LZMA SDK. It's called LZMA1 here to prevent developers from 23 * accidentally using LZMA when they actually want LZMA2. 24 */ 25 #define LZMA_FILTER_LZMA1 LZMA_VLI_C(0x4000000000000001) 26 27 /** 28 * \brief LZMA1 Filter ID with extended options (for raw encoder/decoder) 29 * 30 * This is like LZMA_FILTER_LZMA1 but with this ID a few extra options 31 * are supported in the lzma_options_lzma structure: 32 * 33 * - A flag to tell the encoder if the end of payload marker (EOPM) alias 34 * end of stream (EOS) marker must be written at the end of the stream. 35 * In contrast, LZMA_FILTER_LZMA1 always writes the end marker. 36 * 37 * - Decoder needs to be told the uncompressed size of the stream 38 * or that it is unknown (using the special value UINT64_MAX). 39 * If the size is known, a flag can be set to allow the presence of 40 * the end marker anyway. In contrast, LZMA_FILTER_LZMA1 always 41 * behaves as if the uncompressed size was unknown. 42 * 43 * This allows handling file formats where LZMA1 streams are used but where 44 * the end marker isn't allowed or where it might not (always) be present. 45 * This extended LZMA1 functionality is provided as a Filter ID for raw 46 * encoder and decoder instead of adding new encoder and decoder initialization 47 * functions because this way it is possible to also use extra filters, 48 * for example, LZMA_FILTER_X86 in a filter chain with LZMA_FILTER_LZMA1EXT, 49 * which might be needed to handle some file formats. 50 */ 51 #define LZMA_FILTER_LZMA1EXT LZMA_VLI_C(0x4000000000000002) 52 53 /** 54 * \brief LZMA2 Filter ID 55 * 56 * Usually you want this instead of LZMA1. Compared to LZMA1, LZMA2 adds 57 * support for LZMA_SYNC_FLUSH, uncompressed chunks (smaller expansion 58 * when trying to compress incompressible data), possibility to change 59 * lc/lp/pb in the middle of encoding, and some other internal improvements. 60 */ 61 #define LZMA_FILTER_LZMA2 LZMA_VLI_C(0x21) 62 63 64 /** 65 * \brief Match finders 66 * 67 * Match finder has major effect on both speed and compression ratio. 68 * Usually hash chains are faster than binary trees. 69 * 70 * If you will use LZMA_SYNC_FLUSH often, the hash chains may be a better 71 * choice, because binary trees get much higher compression ratio penalty 72 * with LZMA_SYNC_FLUSH. 73 * 74 * The memory usage formulas are only rough estimates, which are closest to 75 * reality when dict_size is a power of two. The formulas are more complex 76 * in reality, and can also change a little between liblzma versions. Use 77 * lzma_raw_encoder_memusage() to get more accurate estimate of memory usage. 78 */ 79 typedef enum { 80 LZMA_MF_HC3 = 0x03, 81 /**< 82 * \brief Hash Chain with 2- and 3-byte hashing 83 * 84 * Minimum nice_len: 3 85 * 86 * Memory usage: 87 * - dict_size <= 16 MiB: dict_size * 7.5 88 * - dict_size > 16 MiB: dict_size * 5.5 + 64 MiB 89 */ 90 91 LZMA_MF_HC4 = 0x04, 92 /**< 93 * \brief Hash Chain with 2-, 3-, and 4-byte hashing 94 * 95 * Minimum nice_len: 4 96 * 97 * Memory usage: 98 * - dict_size <= 32 MiB: dict_size * 7.5 99 * - dict_size > 32 MiB: dict_size * 6.5 100 */ 101 102 LZMA_MF_BT2 = 0x12, 103 /**< 104 * \brief Binary Tree with 2-byte hashing 105 * 106 * Minimum nice_len: 2 107 * 108 * Memory usage: dict_size * 9.5 109 */ 110 111 LZMA_MF_BT3 = 0x13, 112 /**< 113 * \brief Binary Tree with 2- and 3-byte hashing 114 * 115 * Minimum nice_len: 3 116 * 117 * Memory usage: 118 * - dict_size <= 16 MiB: dict_size * 11.5 119 * - dict_size > 16 MiB: dict_size * 9.5 + 64 MiB 120 */ 121 122 LZMA_MF_BT4 = 0x14 123 /**< 124 * \brief Binary Tree with 2-, 3-, and 4-byte hashing 125 * 126 * Minimum nice_len: 4 127 * 128 * Memory usage: 129 * - dict_size <= 32 MiB: dict_size * 11.5 130 * - dict_size > 32 MiB: dict_size * 10.5 131 */ 132 } lzma_match_finder; 133 134 135 /** 136 * \brief Test if given match finder is supported 137 * 138 * It is safe to call this with a value that isn't listed in 139 * lzma_match_finder enumeration; the return value will be false. 140 * 141 * There is no way to list which match finders are available in this 142 * particular liblzma version and build. It would be useless, because 143 * a new match finder, which the application developer wasn't aware, 144 * could require giving additional options to the encoder that the older 145 * match finders don't need. 146 * 147 * \param match_finder Match finder ID 148 * 149 * \return lzma_bool: 150 * - true if the match finder is supported by this liblzma build. 151 * - false otherwise. 152 */ 153 extern LZMA_API(lzma_bool) lzma_mf_is_supported(lzma_match_finder match_finder) 154 lzma_nothrow lzma_attr_const; 155 156 157 /** 158 * \brief Compression modes 159 * 160 * This selects the function used to analyze the data produced by the match 161 * finder. 162 */ 163 typedef enum { 164 LZMA_MODE_FAST = 1, 165 /**< 166 * \brief Fast compression 167 * 168 * Fast mode is usually at its best when combined with 169 * a hash chain match finder. 170 */ 171 172 LZMA_MODE_NORMAL = 2 173 /**< 174 * \brief Normal compression 175 * 176 * This is usually notably slower than fast mode. Use this 177 * together with binary tree match finders to expose the 178 * full potential of the LZMA1 or LZMA2 encoder. 179 */ 180 } lzma_mode; 181 182 183 /** 184 * \brief Test if given compression mode is supported 185 * 186 * It is safe to call this with a value that isn't listed in lzma_mode 187 * enumeration; the return value will be false. 188 * 189 * There is no way to list which modes are available in this particular 190 * liblzma version and build. It would be useless, because a new compression 191 * mode, which the application developer wasn't aware, could require giving 192 * additional options to the encoder that the older modes don't need. 193 * 194 * \param mode Mode ID. 195 * 196 * \return lzma_bool: 197 * - true if the compression mode is supported by this liblzma 198 * build. 199 * - false otherwise. 200 */ 201 extern LZMA_API(lzma_bool) lzma_mode_is_supported(lzma_mode mode) 202 lzma_nothrow lzma_attr_const; 203 204 205 /** 206 * \brief Options specific to the LZMA1 and LZMA2 filters 207 * 208 * Since LZMA1 and LZMA2 share most of the code, it's simplest to share 209 * the options structure too. For encoding, all but the reserved variables 210 * need to be initialized unless specifically mentioned otherwise. 211 * lzma_lzma_preset() can be used to get a good starting point. 212 * 213 * For raw decoding, both LZMA1 and LZMA2 need dict_size, preset_dict, and 214 * preset_dict_size (if preset_dict != NULL). LZMA1 needs also lc, lp, and pb. 215 */ 216 typedef struct { 217 /** 218 * \brief Dictionary size in bytes 219 * 220 * Dictionary size indicates how many bytes of the recently processed 221 * uncompressed data is kept in memory. One method to reduce size of 222 * the uncompressed data is to store distance-length pairs, which 223 * indicate what data to repeat from the dictionary buffer. Thus, 224 * the bigger the dictionary, the better the compression ratio 225 * usually is. 226 * 227 * Maximum size of the dictionary depends on multiple things: 228 * - Memory usage limit 229 * - Available address space (not a problem on 64-bit systems) 230 * - Selected match finder (encoder only) 231 * 232 * Currently the maximum dictionary size for encoding is 1.5 GiB 233 * (i.e. (UINT32_C(1) << 30) + (UINT32_C(1) << 29)) even on 64-bit 234 * systems for certain match finder implementation reasons. In the 235 * future, there may be match finders that support bigger 236 * dictionaries. 237 * 238 * Decoder already supports dictionaries up to 4 GiB - 1 B (i.e. 239 * UINT32_MAX), so increasing the maximum dictionary size of the 240 * encoder won't cause problems for old decoders. 241 * 242 * Because extremely small dictionaries sizes would have unneeded 243 * overhead in the decoder, the minimum dictionary size is 4096 bytes. 244 * 245 * \note When decoding, too big dictionary does no other harm 246 * than wasting memory. 247 */ 248 uint32_t dict_size; 249 # define LZMA_DICT_SIZE_MIN UINT32_C(4096) 250 # define LZMA_DICT_SIZE_DEFAULT (UINT32_C(1) << 23) 251 252 /** 253 * \brief Pointer to an initial dictionary 254 * 255 * It is possible to initialize the LZ77 history window using 256 * a preset dictionary. It is useful when compressing many 257 * similar, relatively small chunks of data independently from 258 * each other. The preset dictionary should contain typical 259 * strings that occur in the files being compressed. The most 260 * probable strings should be near the end of the preset dictionary. 261 * 262 * This feature should be used only in special situations. For 263 * now, it works correctly only with raw encoding and decoding. 264 * Currently none of the container formats supported by 265 * liblzma allow preset dictionary when decoding, thus if 266 * you create a .xz or .lzma file with preset dictionary, it 267 * cannot be decoded with the regular decoder functions. In the 268 * future, the .xz format will likely get support for preset 269 * dictionary though. 270 */ 271 const uint8_t *preset_dict; 272 273 /** 274 * \brief Size of the preset dictionary 275 * 276 * Specifies the size of the preset dictionary. If the size is 277 * bigger than dict_size, only the last dict_size bytes are 278 * processed. 279 * 280 * This variable is read only when preset_dict is not NULL. 281 * If preset_dict is not NULL but preset_dict_size is zero, 282 * no preset dictionary is used (identical to only setting 283 * preset_dict to NULL). 284 */ 285 uint32_t preset_dict_size; 286 287 /** 288 * \brief Number of literal context bits 289 * 290 * How many of the highest bits of the previous uncompressed 291 * eight-bit byte (also known as 'literal') are taken into 292 * account when predicting the bits of the next literal. 293 * 294 * E.g. in typical English text, an upper-case letter is 295 * often followed by a lower-case letter, and a lower-case 296 * letter is usually followed by another lower-case letter. 297 * In the US-ASCII character set, the highest three bits are 010 298 * for upper-case letters and 011 for lower-case letters. 299 * When lc is at least 3, the literal coding can take advantage of 300 * this property in the uncompressed data. 301 * 302 * There is a limit that applies to literal context bits and literal 303 * position bits together: lc + lp <= 4. Without this limit the 304 * decoding could become very slow, which could have security related 305 * results in some cases like email servers doing virus scanning. 306 * This limit also simplifies the internal implementation in liblzma. 307 * 308 * There may be LZMA1 streams that have lc + lp > 4 (maximum possible 309 * lc would be 8). It is not possible to decode such streams with 310 * liblzma. 311 */ 312 uint32_t lc; 313 # define LZMA_LCLP_MIN 0 314 # define LZMA_LCLP_MAX 4 315 # define LZMA_LC_DEFAULT 3 316 317 /** 318 * \brief Number of literal position bits 319 * 320 * lp affects what kind of alignment in the uncompressed data is 321 * assumed when encoding literals. A literal is a single 8-bit byte. 322 * See pb below for more information about alignment. 323 */ 324 uint32_t lp; 325 # define LZMA_LP_DEFAULT 0 326 327 /** 328 * \brief Number of position bits 329 * 330 * pb affects what kind of alignment in the uncompressed data is 331 * assumed in general. The default means four-byte alignment 332 * (2^ pb =2^2=4), which is often a good choice when there's 333 * no better guess. 334 * 335 * When the alignment is known, setting pb accordingly may reduce 336 * the file size a little. E.g. with text files having one-byte 337 * alignment (US-ASCII, ISO-8859-*, UTF-8), setting pb=0 can 338 * improve compression slightly. For UTF-16 text, pb=1 is a good 339 * choice. If the alignment is an odd number like 3 bytes, pb=0 340 * might be the best choice. 341 * 342 * Even though the assumed alignment can be adjusted with pb and 343 * lp, LZMA1 and LZMA2 still slightly favor 16-byte alignment. 344 * It might be worth taking into account when designing file formats 345 * that are likely to be often compressed with LZMA1 or LZMA2. 346 */ 347 uint32_t pb; 348 # define LZMA_PB_MIN 0 349 # define LZMA_PB_MAX 4 350 # define LZMA_PB_DEFAULT 2 351 352 /** Compression mode */ 353 lzma_mode mode; 354 355 /** 356 * \brief Nice length of a match 357 * 358 * This determines how many bytes the encoder compares from the match 359 * candidates when looking for the best match. Once a match of at 360 * least nice_len bytes long is found, the encoder stops looking for 361 * better candidates and encodes the match. (Naturally, if the found 362 * match is actually longer than nice_len, the actual length is 363 * encoded; it's not truncated to nice_len.) 364 * 365 * Bigger values usually increase the compression ratio and 366 * compression time. For most files, 32 to 128 is a good value, 367 * which gives very good compression ratio at good speed. 368 * 369 * The exact minimum value depends on the match finder. The maximum 370 * is 273, which is the maximum length of a match that LZMA1 and 371 * LZMA2 can encode. 372 */ 373 uint32_t nice_len; 374 375 /** Match finder ID */ 376 lzma_match_finder mf; 377 378 /** 379 * \brief Maximum search depth in the match finder 380 * 381 * For every input byte, match finder searches through the hash chain 382 * or binary tree in a loop, each iteration going one step deeper in 383 * the chain or tree. The searching stops if 384 * - a match of at least nice_len bytes long is found; 385 * - all match candidates from the hash chain or binary tree have 386 * been checked; or 387 * - maximum search depth is reached. 388 * 389 * Maximum search depth is needed to prevent the match finder from 390 * wasting too much time in case there are lots of short match 391 * candidates. On the other hand, stopping the search before all 392 * candidates have been checked can reduce compression ratio. 393 * 394 * Setting depth to zero tells liblzma to use an automatic default 395 * value, that depends on the selected match finder and nice_len. 396 * The default is in the range [4, 200] or so (it may vary between 397 * liblzma versions). 398 * 399 * Using a bigger depth value than the default can increase 400 * compression ratio in some cases. There is no strict maximum value, 401 * but high values (thousands or millions) should be used with care: 402 * the encoder could remain fast enough with typical input, but 403 * malicious input could cause the match finder to slow down 404 * dramatically, possibly creating a denial of service attack. 405 */ 406 uint32_t depth; 407 408 /** 409 * \brief For LZMA_FILTER_LZMA1EXT: Extended flags 410 * 411 * This is used only with LZMA_FILTER_LZMA1EXT. 412 * 413 * Currently only one flag is supported, LZMA_LZMA1EXT_ALLOW_EOPM: 414 * 415 * - Encoder: If the flag is set, then end marker is written just 416 * like it is with LZMA_FILTER_LZMA1. Without this flag the 417 * end marker isn't written and the application has to store 418 * the uncompressed size somewhere outside the compressed stream. 419 * To decompress streams without the end marker, the application 420 * has to set the correct uncompressed size in ext_size_low and 421 * ext_size_high. 422 * 423 * - Decoder: If the uncompressed size in ext_size_low and 424 * ext_size_high is set to the special value UINT64_MAX 425 * (indicating unknown uncompressed size) then this flag is 426 * ignored and the end marker must always be present, that is, 427 * the behavior is identical to LZMA_FILTER_LZMA1. 428 * 429 * Otherwise, if this flag isn't set, then the input stream 430 * must not have the end marker; if the end marker is detected 431 * then it will result in LZMA_DATA_ERROR. This is useful when 432 * it is known that the stream must not have the end marker and 433 * strict validation is wanted. 434 * 435 * If this flag is set, then it is autodetected if the end marker 436 * is present after the specified number of uncompressed bytes 437 * has been decompressed (ext_size_low and ext_size_high). The 438 * end marker isn't allowed in any other position. This behavior 439 * is useful when uncompressed size is known but the end marker 440 * may or may not be present. This is the case, for example, 441 * in .7z files (valid .7z files that have the end marker in 442 * LZMA1 streams are rare but they do exist). 443 */ 444 uint32_t ext_flags; 445 # define LZMA_LZMA1EXT_ALLOW_EOPM UINT32_C(0x01) 446 447 /** 448 * \brief For LZMA_FILTER_LZMA1EXT: Uncompressed size (low bits) 449 * 450 * The 64-bit uncompressed size is needed for decompression with 451 * LZMA_FILTER_LZMA1EXT. The size is ignored by the encoder. 452 * 453 * The special value UINT64_MAX indicates that the uncompressed size 454 * is unknown and that the end of payload marker (also known as 455 * end of stream marker) must be present to indicate the end of 456 * the LZMA1 stream. Any other value indicates the expected 457 * uncompressed size of the LZMA1 stream. (If LZMA1 was used together 458 * with filters that change the size of the data then the uncompressed 459 * size of the LZMA1 stream could be different than the final 460 * uncompressed size of the filtered stream.) 461 * 462 * ext_size_low holds the least significant 32 bits of the 463 * uncompressed size. The most significant 32 bits must be set 464 * in ext_size_high. The macro lzma_ext_size_set(opt_lzma, u64size) 465 * can be used to set these members. 466 * 467 * The 64-bit uncompressed size is split into two uint32_t variables 468 * because there were no reserved uint64_t members and using the 469 * same options structure for LZMA_FILTER_LZMA1, LZMA_FILTER_LZMA1EXT, 470 * and LZMA_FILTER_LZMA2 was otherwise more convenient than having 471 * a new options structure for LZMA_FILTER_LZMA1EXT. (Replacing two 472 * uint32_t members with one uint64_t changes the ABI on some systems 473 * as the alignment of this struct can increase from 4 bytes to 8.) 474 */ 475 uint32_t ext_size_low; 476 477 /** 478 * \brief For LZMA_FILTER_LZMA1EXT: Uncompressed size (high bits) 479 * 480 * This holds the most significant 32 bits of the uncompressed size. 481 */ 482 uint32_t ext_size_high; 483 484 /* 485 * Reserved space to allow possible future extensions without 486 * breaking the ABI. You should not touch these, because the names 487 * of these variables may change. These are and will never be used 488 * with the currently supported options, so it is safe to leave these 489 * uninitialized. 490 */ 491 492 /** \private Reserved member. */ 493 uint32_t reserved_int4; 494 495 /** \private Reserved member. */ 496 uint32_t reserved_int5; 497 498 /** \private Reserved member. */ 499 uint32_t reserved_int6; 500 501 /** \private Reserved member. */ 502 uint32_t reserved_int7; 503 504 /** \private Reserved member. */ 505 uint32_t reserved_int8; 506 507 /** \private Reserved member. */ 508 lzma_reserved_enum reserved_enum1; 509 510 /** \private Reserved member. */ 511 lzma_reserved_enum reserved_enum2; 512 513 /** \private Reserved member. */ 514 lzma_reserved_enum reserved_enum3; 515 516 /** \private Reserved member. */ 517 lzma_reserved_enum reserved_enum4; 518 519 /** \private Reserved member. */ 520 void *reserved_ptr1; 521 522 /** \private Reserved member. */ 523 void *reserved_ptr2; 524 525 } lzma_options_lzma; 526 527 528 /** 529 * \brief Macro to set the 64-bit uncompressed size in ext_size_* 530 * 531 * This might be convenient when decoding using LZMA_FILTER_LZMA1EXT. 532 * This isn't used with LZMA_FILTER_LZMA1 or LZMA_FILTER_LZMA2. 533 */ 534 #define lzma_set_ext_size(opt_lzma2, u64size) \ 535 do { \ 536 (opt_lzma2).ext_size_low = (uint32_t)(u64size); \ 537 (opt_lzma2).ext_size_high = (uint32_t)((uint64_t)(u64size) >> 32); \ 538 } while (0) 539 540 541 /** 542 * \brief Set a compression preset to lzma_options_lzma structure 543 * 544 * 0 is the fastest and 9 is the slowest. These match the switches -0 .. -9 545 * of the xz command line tool. In addition, it is possible to bitwise-or 546 * flags to the preset. Currently only LZMA_PRESET_EXTREME is supported. 547 * The flags are defined in container.h, because the flags are used also 548 * with lzma_easy_encoder(). 549 * 550 * The preset levels are subject to changes between liblzma versions. 551 * 552 * This function is available only if LZMA1 or LZMA2 encoder has been enabled 553 * when building liblzma. 554 * 555 * If features (like certain match finders) have been disabled at build time, 556 * then the function may return success (false) even though the resulting 557 * LZMA1/LZMA2 options may not be usable for encoder initialization 558 * (LZMA_OPTIONS_ERROR). 559 * 560 * \param[out] options Pointer to LZMA1 or LZMA2 options to be filled 561 * \param preset Preset level bitwse-ORed with preset flags 562 * 563 * \return lzma_bool: 564 * - true if the preset is not supported (failure). 565 * - false otherwise (success). 566 */ 567 extern LZMA_API(lzma_bool) lzma_lzma_preset( 568 lzma_options_lzma *options, uint32_t preset) lzma_nothrow; 569