1 /** 2 * \file lzma/lzma12.h 3 * \brief LZMA1 and LZMA2 filters 4 */ 5 6 /* 7 * Author: Lasse Collin 8 * 9 * This file has been put into the public domain. 10 * You can do whatever you want with this file. 11 * 12 * See ../lzma.h for information about liblzma as a whole. 13 */ 14 15 #ifndef LZMA_H_INTERNAL 16 # error Never include this file directly. Use <lzma.h> instead. 17 #endif 18 19 20 /** 21 * \brief LZMA1 Filter ID (for raw encoder/decoder only, not in .xz) 22 * 23 * LZMA1 is the very same thing as what was called just LZMA in LZMA Utils, 24 * 7-Zip, and LZMA SDK. It's called LZMA1 here to prevent developers from 25 * accidentally using LZMA when they actually want LZMA2. 26 */ 27 #define LZMA_FILTER_LZMA1 LZMA_VLI_C(0x4000000000000001) 28 29 /** 30 * \brief LZMA1 Filter ID with extended options (for raw encoder/decoder) 31 * 32 * This is like LZMA_FILTER_LZMA1 but with this ID a few extra options 33 * are supported in the lzma_options_lzma structure: 34 * 35 * - A flag to tell the encoder if the end of payload marker (EOPM) alias 36 * end of stream (EOS) marker must be written at the end of the stream. 37 * In contrast, LZMA_FILTER_LZMA1 always writes the end marker. 38 * 39 * - Decoder needs to be told the uncompressed size of the stream 40 * or that it is unknown (using the special value UINT64_MAX). 41 * If the size is known, a flag can be set to allow the presence of 42 * the end marker anyway. In contrast, LZMA_FILTER_LZMA1 always 43 * behaves as if the uncompressed size was unknown. 44 * 45 * This allows handling file formats where LZMA1 streams are used but where 46 * the end marker isn't allowed or where it might not (always) be present. 47 * This extended LZMA1 functionality is provided as a Filter ID for raw 48 * encoder and decoder instead of adding new encoder and decoder initialization 49 * functions because this way it is possible to also use extra filters, 50 * for example, LZMA_FILTER_X86 in a filter chain with LZMA_FILTER_LZMA1EXT, 51 * which might be needed to handle some file formats. 52 */ 53 #define LZMA_FILTER_LZMA1EXT LZMA_VLI_C(0x4000000000000002) 54 55 /** 56 * \brief LZMA2 Filter ID 57 * 58 * Usually you want this instead of LZMA1. Compared to LZMA1, LZMA2 adds 59 * support for LZMA_SYNC_FLUSH, uncompressed chunks (smaller expansion 60 * when trying to compress uncompressible data), possibility to change 61 * lc/lp/pb in the middle of encoding, and some other internal improvements. 62 */ 63 #define LZMA_FILTER_LZMA2 LZMA_VLI_C(0x21) 64 65 66 /** 67 * \brief Match finders 68 * 69 * Match finder has major effect on both speed and compression ratio. 70 * Usually hash chains are faster than binary trees. 71 * 72 * If you will use LZMA_SYNC_FLUSH often, the hash chains may be a better 73 * choice, because binary trees get much higher compression ratio penalty 74 * with LZMA_SYNC_FLUSH. 75 * 76 * The memory usage formulas are only rough estimates, which are closest to 77 * reality when dict_size is a power of two. The formulas are more complex 78 * in reality, and can also change a little between liblzma versions. Use 79 * lzma_raw_encoder_memusage() to get more accurate estimate of memory usage. 80 */ 81 typedef enum { 82 LZMA_MF_HC3 = 0x03, 83 /**< 84 * \brief Hash Chain with 2- and 3-byte hashing 85 * 86 * Minimum nice_len: 3 87 * 88 * Memory usage: 89 * - dict_size <= 16 MiB: dict_size * 7.5 90 * - dict_size > 16 MiB: dict_size * 5.5 + 64 MiB 91 */ 92 93 LZMA_MF_HC4 = 0x04, 94 /**< 95 * \brief Hash Chain with 2-, 3-, and 4-byte hashing 96 * 97 * Minimum nice_len: 4 98 * 99 * Memory usage: 100 * - dict_size <= 32 MiB: dict_size * 7.5 101 * - dict_size > 32 MiB: dict_size * 6.5 102 */ 103 104 LZMA_MF_BT2 = 0x12, 105 /**< 106 * \brief Binary Tree with 2-byte hashing 107 * 108 * Minimum nice_len: 2 109 * 110 * Memory usage: dict_size * 9.5 111 */ 112 113 LZMA_MF_BT3 = 0x13, 114 /**< 115 * \brief Binary Tree with 2- and 3-byte hashing 116 * 117 * Minimum nice_len: 3 118 * 119 * Memory usage: 120 * - dict_size <= 16 MiB: dict_size * 11.5 121 * - dict_size > 16 MiB: dict_size * 9.5 + 64 MiB 122 */ 123 124 LZMA_MF_BT4 = 0x14 125 /**< 126 * \brief Binary Tree with 2-, 3-, and 4-byte hashing 127 * 128 * Minimum nice_len: 4 129 * 130 * Memory usage: 131 * - dict_size <= 32 MiB: dict_size * 11.5 132 * - dict_size > 32 MiB: dict_size * 10.5 133 */ 134 } lzma_match_finder; 135 136 137 /** 138 * \brief Test if given match finder is supported 139 * 140 * Return true if the given match finder is supported by this liblzma build. 141 * Otherwise false is returned. It is safe to call this with a value that 142 * isn't listed in lzma_match_finder enumeration; the return value will be 143 * false. 144 * 145 * There is no way to list which match finders are available in this 146 * particular liblzma version and build. It would be useless, because 147 * a new match finder, which the application developer wasn't aware, 148 * could require giving additional options to the encoder that the older 149 * match finders don't need. 150 */ 151 extern LZMA_API(lzma_bool) lzma_mf_is_supported(lzma_match_finder match_finder) 152 lzma_nothrow lzma_attr_const; 153 154 155 /** 156 * \brief Compression modes 157 * 158 * This selects the function used to analyze the data produced by the match 159 * finder. 160 */ 161 typedef enum { 162 LZMA_MODE_FAST = 1, 163 /**< 164 * \brief Fast compression 165 * 166 * Fast mode is usually at its best when combined with 167 * a hash chain match finder. 168 */ 169 170 LZMA_MODE_NORMAL = 2 171 /**< 172 * \brief Normal compression 173 * 174 * This is usually notably slower than fast mode. Use this 175 * together with binary tree match finders to expose the 176 * full potential of the LZMA1 or LZMA2 encoder. 177 */ 178 } lzma_mode; 179 180 181 /** 182 * \brief Test if given compression mode is supported 183 * 184 * Return true if the given compression mode is supported by this liblzma 185 * build. Otherwise false is returned. It is safe to call this with a value 186 * that isn't listed in lzma_mode enumeration; the return value will be false. 187 * 188 * There is no way to list which modes are available in this particular 189 * liblzma version and build. It would be useless, because a new compression 190 * mode, which the application developer wasn't aware, could require giving 191 * additional options to the encoder that the older modes don't need. 192 */ 193 extern LZMA_API(lzma_bool) lzma_mode_is_supported(lzma_mode mode) 194 lzma_nothrow lzma_attr_const; 195 196 197 /** 198 * \brief Options specific to the LZMA1 and LZMA2 filters 199 * 200 * Since LZMA1 and LZMA2 share most of the code, it's simplest to share 201 * the options structure too. For encoding, all but the reserved variables 202 * need to be initialized unless specifically mentioned otherwise. 203 * lzma_lzma_preset() can be used to get a good starting point. 204 * 205 * For raw decoding, both LZMA1 and LZMA2 need dict_size, preset_dict, and 206 * preset_dict_size (if preset_dict != NULL). LZMA1 needs also lc, lp, and pb. 207 */ 208 typedef struct { 209 /** 210 * \brief Dictionary size in bytes 211 * 212 * Dictionary size indicates how many bytes of the recently processed 213 * uncompressed data is kept in memory. One method to reduce size of 214 * the uncompressed data is to store distance-length pairs, which 215 * indicate what data to repeat from the dictionary buffer. Thus, 216 * the bigger the dictionary, the better the compression ratio 217 * usually is. 218 * 219 * Maximum size of the dictionary depends on multiple things: 220 * - Memory usage limit 221 * - Available address space (not a problem on 64-bit systems) 222 * - Selected match finder (encoder only) 223 * 224 * Currently the maximum dictionary size for encoding is 1.5 GiB 225 * (i.e. (UINT32_C(1) << 30) + (UINT32_C(1) << 29)) even on 64-bit 226 * systems for certain match finder implementation reasons. In the 227 * future, there may be match finders that support bigger 228 * dictionaries. 229 * 230 * Decoder already supports dictionaries up to 4 GiB - 1 B (i.e. 231 * UINT32_MAX), so increasing the maximum dictionary size of the 232 * encoder won't cause problems for old decoders. 233 * 234 * Because extremely small dictionaries sizes would have unneeded 235 * overhead in the decoder, the minimum dictionary size is 4096 bytes. 236 * 237 * \note When decoding, too big dictionary does no other harm 238 * than wasting memory. 239 */ 240 uint32_t dict_size; 241 # define LZMA_DICT_SIZE_MIN UINT32_C(4096) 242 # define LZMA_DICT_SIZE_DEFAULT (UINT32_C(1) << 23) 243 244 /** 245 * \brief Pointer to an initial dictionary 246 * 247 * It is possible to initialize the LZ77 history window using 248 * a preset dictionary. It is useful when compressing many 249 * similar, relatively small chunks of data independently from 250 * each other. The preset dictionary should contain typical 251 * strings that occur in the files being compressed. The most 252 * probable strings should be near the end of the preset dictionary. 253 * 254 * This feature should be used only in special situations. For 255 * now, it works correctly only with raw encoding and decoding. 256 * Currently none of the container formats supported by 257 * liblzma allow preset dictionary when decoding, thus if 258 * you create a .xz or .lzma file with preset dictionary, it 259 * cannot be decoded with the regular decoder functions. In the 260 * future, the .xz format will likely get support for preset 261 * dictionary though. 262 */ 263 const uint8_t *preset_dict; 264 265 /** 266 * \brief Size of the preset dictionary 267 * 268 * Specifies the size of the preset dictionary. If the size is 269 * bigger than dict_size, only the last dict_size bytes are 270 * processed. 271 * 272 * This variable is read only when preset_dict is not NULL. 273 * If preset_dict is not NULL but preset_dict_size is zero, 274 * no preset dictionary is used (identical to only setting 275 * preset_dict to NULL). 276 */ 277 uint32_t preset_dict_size; 278 279 /** 280 * \brief Number of literal context bits 281 * 282 * How many of the highest bits of the previous uncompressed 283 * eight-bit byte (also known as `literal') are taken into 284 * account when predicting the bits of the next literal. 285 * 286 * E.g. in typical English text, an upper-case letter is 287 * often followed by a lower-case letter, and a lower-case 288 * letter is usually followed by another lower-case letter. 289 * In the US-ASCII character set, the highest three bits are 010 290 * for upper-case letters and 011 for lower-case letters. 291 * When lc is at least 3, the literal coding can take advantage of 292 * this property in the uncompressed data. 293 * 294 * There is a limit that applies to literal context bits and literal 295 * position bits together: lc + lp <= 4. Without this limit the 296 * decoding could become very slow, which could have security related 297 * results in some cases like email servers doing virus scanning. 298 * This limit also simplifies the internal implementation in liblzma. 299 * 300 * There may be LZMA1 streams that have lc + lp > 4 (maximum possible 301 * lc would be 8). It is not possible to decode such streams with 302 * liblzma. 303 */ 304 uint32_t lc; 305 # define LZMA_LCLP_MIN 0 306 # define LZMA_LCLP_MAX 4 307 # define LZMA_LC_DEFAULT 3 308 309 /** 310 * \brief Number of literal position bits 311 * 312 * lp affects what kind of alignment in the uncompressed data is 313 * assumed when encoding literals. A literal is a single 8-bit byte. 314 * See pb below for more information about alignment. 315 */ 316 uint32_t lp; 317 # define LZMA_LP_DEFAULT 0 318 319 /** 320 * \brief Number of position bits 321 * 322 * pb affects what kind of alignment in the uncompressed data is 323 * assumed in general. The default means four-byte alignment 324 * (2^ pb =2^2=4), which is often a good choice when there's 325 * no better guess. 326 * 327 * When the alignment is known, setting pb accordingly may reduce 328 * the file size a little. E.g. with text files having one-byte 329 * alignment (US-ASCII, ISO-8859-*, UTF-8), setting pb=0 can 330 * improve compression slightly. For UTF-16 text, pb=1 is a good 331 * choice. If the alignment is an odd number like 3 bytes, pb=0 332 * might be the best choice. 333 * 334 * Even though the assumed alignment can be adjusted with pb and 335 * lp, LZMA1 and LZMA2 still slightly favor 16-byte alignment. 336 * It might be worth taking into account when designing file formats 337 * that are likely to be often compressed with LZMA1 or LZMA2. 338 */ 339 uint32_t pb; 340 # define LZMA_PB_MIN 0 341 # define LZMA_PB_MAX 4 342 # define LZMA_PB_DEFAULT 2 343 344 /** Compression mode */ 345 lzma_mode mode; 346 347 /** 348 * \brief Nice length of a match 349 * 350 * This determines how many bytes the encoder compares from the match 351 * candidates when looking for the best match. Once a match of at 352 * least nice_len bytes long is found, the encoder stops looking for 353 * better candidates and encodes the match. (Naturally, if the found 354 * match is actually longer than nice_len, the actual length is 355 * encoded; it's not truncated to nice_len.) 356 * 357 * Bigger values usually increase the compression ratio and 358 * compression time. For most files, 32 to 128 is a good value, 359 * which gives very good compression ratio at good speed. 360 * 361 * The exact minimum value depends on the match finder. The maximum 362 * is 273, which is the maximum length of a match that LZMA1 and 363 * LZMA2 can encode. 364 */ 365 uint32_t nice_len; 366 367 /** Match finder ID */ 368 lzma_match_finder mf; 369 370 /** 371 * \brief Maximum search depth in the match finder 372 * 373 * For every input byte, match finder searches through the hash chain 374 * or binary tree in a loop, each iteration going one step deeper in 375 * the chain or tree. The searching stops if 376 * - a match of at least nice_len bytes long is found; 377 * - all match candidates from the hash chain or binary tree have 378 * been checked; or 379 * - maximum search depth is reached. 380 * 381 * Maximum search depth is needed to prevent the match finder from 382 * wasting too much time in case there are lots of short match 383 * candidates. On the other hand, stopping the search before all 384 * candidates have been checked can reduce compression ratio. 385 * 386 * Setting depth to zero tells liblzma to use an automatic default 387 * value, that depends on the selected match finder and nice_len. 388 * The default is in the range [4, 200] or so (it may vary between 389 * liblzma versions). 390 * 391 * Using a bigger depth value than the default can increase 392 * compression ratio in some cases. There is no strict maximum value, 393 * but high values (thousands or millions) should be used with care: 394 * the encoder could remain fast enough with typical input, but 395 * malicious input could cause the match finder to slow down 396 * dramatically, possibly creating a denial of service attack. 397 */ 398 uint32_t depth; 399 400 /** 401 * \brief For LZMA_FILTER_LZMA1EXT: Extended flags 402 * 403 * This is used only with LZMA_FILTER_LZMA1EXT. 404 * 405 * Currently only one flag is supported, LZMA_LZMA1EXT_ALLOW_EOPM: 406 * 407 * - Encoder: If the flag is set, then end marker is written just 408 * like it is with LZMA_FILTER_LZMA1. Without this flag the 409 * end marker isn't written and the application has to store 410 * the uncompressed size somewhere outside the compressed stream. 411 * To decompress streams without the end marker, the appliation 412 * has to set the correct uncompressed size in ext_size_low and 413 * ext_size_high. 414 * 415 * - Decoder: If the uncompressed size in ext_size_low and 416 * ext_size_high is set to the special value UINT64_MAX 417 * (indicating unknown uncompressed size) then this flag is 418 * ignored and the end marker must always be present, that is, 419 * the behavior is identical to LZMA_FILTER_LZMA1. 420 * 421 * Otherwise, if this flag isn't set, then the input stream 422 * must not have the end marker; if the end marker is detected 423 * then it will result in LZMA_DATA_ERROR. This is useful when 424 * it is known that the stream must not have the end marker and 425 * strict validation is wanted. 426 * 427 * If this flag is set, then it is autodetected if the end marker 428 * is present after the specified number of uncompressed bytes 429 * has been decompressed (ext_size_low and ext_size_high). The 430 * end marker isn't allowed in any other position. This behavior 431 * is useful when uncompressed size is known but the end marker 432 * may or may not be present. This is the case, for example, 433 * in .7z files (valid .7z files that have the end marker in 434 * LZMA1 streams are rare but they do exist). 435 */ 436 uint32_t ext_flags; 437 # define LZMA_LZMA1EXT_ALLOW_EOPM UINT32_C(0x01) 438 439 /** 440 * \brief For LZMA_FILTER_LZMA1EXT: Uncompressed size (low bits) 441 * 442 * The 64-bit uncompressed size is needed for decompression with 443 * LZMA_FILTER_LZMA1EXT. The size is ignored by the encoder. 444 * 445 * The special value UINT64_MAX indicates that the uncompressed size 446 * is unknown and that the end of payload marker (also known as 447 * end of stream marker) must be present to indicate the end of 448 * the LZMA1 stream. Any other value indicates the expected 449 * uncompressed size of the LZMA1 stream. (If LZMA1 was used together 450 * with filters that change the size of the data then the uncompressed 451 * size of the LZMA1 stream could be different than the final 452 * uncompressed size of the filtered stream.) 453 * 454 * ext_size_low holds the least significant 32 bits of the 455 * uncompressed size. The most significant 32 bits must be set 456 * in ext_size_high. The macro lzma_ext_size_set(opt_lzma, u64size) 457 * can be used to set these members. 458 * 459 * The 64-bit uncompressed size is split into two uint32_t variables 460 * because there were no reserved uint64_t members and using the 461 * same options structure for LZMA_FILTER_LZMA1, LZMA_FILTER_LZMA1EXT, 462 * and LZMA_FILTER_LZMA2 was otherwise more convenient than having 463 * a new options structure for LZMA_FILTER_LZMA1EXT. (Replacing two 464 * uint32_t members with one uint64_t changes the ABI on some systems 465 * as the alignment of this struct can increase from 4 bytes to 8.) 466 */ 467 uint32_t ext_size_low; 468 469 /** 470 * \brief For LZMA_FILTER_LZMA1EXT: Uncompressed size (high bits) 471 * 472 * This holds the most significant 32 bits of the uncompressed size. 473 */ 474 uint32_t ext_size_high; 475 476 /* 477 * Reserved space to allow possible future extensions without 478 * breaking the ABI. You should not touch these, because the names 479 * of these variables may change. These are and will never be used 480 * with the currently supported options, so it is safe to leave these 481 * uninitialized. 482 */ 483 uint32_t reserved_int4; 484 uint32_t reserved_int5; 485 uint32_t reserved_int6; 486 uint32_t reserved_int7; 487 uint32_t reserved_int8; 488 lzma_reserved_enum reserved_enum1; 489 lzma_reserved_enum reserved_enum2; 490 lzma_reserved_enum reserved_enum3; 491 lzma_reserved_enum reserved_enum4; 492 void *reserved_ptr1; 493 void *reserved_ptr2; 494 495 } lzma_options_lzma; 496 497 498 /** 499 * \brief Macro to set the 64-bit uncompressed size in ext_size_* 500 * 501 * This might be convenient when decoding using LZMA_FILTER_LZMA1EXT. 502 * This isn't used with LZMA_FILTER_LZMA1 or LZMA_FILTER_LZMA2. 503 */ 504 #define lzma_set_ext_size(opt_lzma2, u64size) \ 505 do { \ 506 (opt_lzma2).ext_size_low = (uint32_t)(u64size); \ 507 (opt_lzma2).ext_size_high = (uint32_t)((uint64_t)(u64size) >> 32); \ 508 } while (0) 509 510 511 /** 512 * \brief Set a compression preset to lzma_options_lzma structure 513 * 514 * 0 is the fastest and 9 is the slowest. These match the switches -0 .. -9 515 * of the xz command line tool. In addition, it is possible to bitwise-or 516 * flags to the preset. Currently only LZMA_PRESET_EXTREME is supported. 517 * The flags are defined in container.h, because the flags are used also 518 * with lzma_easy_encoder(). 519 * 520 * The preset values are subject to changes between liblzma versions. 521 * 522 * This function is available only if LZMA1 or LZMA2 encoder has been enabled 523 * when building liblzma. 524 * 525 * \return On success, false is returned. If the preset is not 526 * supported, true is returned. 527 */ 528 extern LZMA_API(lzma_bool) lzma_lzma_preset( 529 lzma_options_lzma *options, uint32_t preset) lzma_nothrow; 530