1*3b35e7eeSXin LI // SPDX-License-Identifier: 0BSD 2*3b35e7eeSXin LI 381ad8388SMartin Matuska /////////////////////////////////////////////////////////////////////////////// 481ad8388SMartin Matuska // 581ad8388SMartin Matuska /// \file lz_encoder.h 681ad8388SMartin Matuska /// \brief LZ in window and match finder API 781ad8388SMartin Matuska /// 881ad8388SMartin Matuska // Authors: Igor Pavlov 981ad8388SMartin Matuska // Lasse Collin 1081ad8388SMartin Matuska // 1181ad8388SMartin Matuska /////////////////////////////////////////////////////////////////////////////// 1281ad8388SMartin Matuska 1381ad8388SMartin Matuska #ifndef LZMA_LZ_ENCODER_H 1481ad8388SMartin Matuska #define LZMA_LZ_ENCODER_H 1581ad8388SMartin Matuska 1681ad8388SMartin Matuska #include "common.h" 1781ad8388SMartin Matuska 1881ad8388SMartin Matuska 195ffb19acSXin LI // For now, the dictionary size is limited to 1.5 GiB. This may grow 205ffb19acSXin LI // in the future if needed, but it needs a little more work than just 215ffb19acSXin LI // changing this check. 225ffb19acSXin LI #define IS_ENC_DICT_SIZE_VALID(size) \ 235ffb19acSXin LI ((size) >= LZMA_DICT_SIZE_MIN \ 245ffb19acSXin LI && (size) <= (UINT32_C(1) << 30) + (UINT32_C(1) << 29)) 255ffb19acSXin LI 265ffb19acSXin LI 2781ad8388SMartin Matuska /// A table of these is used by the LZ-based encoder to hold 2881ad8388SMartin Matuska /// the length-distance pairs found by the match finder. 2981ad8388SMartin Matuska typedef struct { 3081ad8388SMartin Matuska uint32_t len; 3181ad8388SMartin Matuska uint32_t dist; 3281ad8388SMartin Matuska } lzma_match; 3381ad8388SMartin Matuska 3481ad8388SMartin Matuska 3581ad8388SMartin Matuska typedef struct lzma_mf_s lzma_mf; 3681ad8388SMartin Matuska struct lzma_mf_s { 3781ad8388SMartin Matuska /////////////// 3881ad8388SMartin Matuska // In Window // 3981ad8388SMartin Matuska /////////////// 4081ad8388SMartin Matuska 4181ad8388SMartin Matuska /// Pointer to buffer with data to be compressed 4281ad8388SMartin Matuska uint8_t *buffer; 4381ad8388SMartin Matuska 4481ad8388SMartin Matuska /// Total size of the allocated buffer (that is, including all 4581ad8388SMartin Matuska /// the extra space) 4681ad8388SMartin Matuska uint32_t size; 4781ad8388SMartin Matuska 4881ad8388SMartin Matuska /// Number of bytes that must be kept available in our input history. 4981ad8388SMartin Matuska /// That is, once keep_size_before bytes have been processed, 5081ad8388SMartin Matuska /// buffer[read_pos - keep_size_before] is the oldest byte that 5181ad8388SMartin Matuska /// must be available for reading. 5281ad8388SMartin Matuska uint32_t keep_size_before; 5381ad8388SMartin Matuska 5481ad8388SMartin Matuska /// Number of bytes that must be kept in buffer after read_pos. 5581ad8388SMartin Matuska /// That is, read_pos <= write_pos - keep_size_after as long as 5681ad8388SMartin Matuska /// action is LZMA_RUN; when action != LZMA_RUN, read_pos is allowed 5781ad8388SMartin Matuska /// to reach write_pos so that the last bytes get encoded too. 5881ad8388SMartin Matuska uint32_t keep_size_after; 5981ad8388SMartin Matuska 6081ad8388SMartin Matuska /// Match finders store locations of matches using 32-bit integers. 6181ad8388SMartin Matuska /// To avoid adjusting several megabytes of integers every time the 6281ad8388SMartin Matuska /// input window is moved with move_window, we only adjust the 6381ad8388SMartin Matuska /// offset of the buffer. Thus, buffer[value_in_hash_table - offset] 6481ad8388SMartin Matuska /// is the byte pointed by value_in_hash_table. 6581ad8388SMartin Matuska uint32_t offset; 6681ad8388SMartin Matuska 6781ad8388SMartin Matuska /// buffer[read_pos] is the next byte to run through the match 6881ad8388SMartin Matuska /// finder. This is incremented in the match finder once the byte 6981ad8388SMartin Matuska /// has been processed. 7081ad8388SMartin Matuska uint32_t read_pos; 7181ad8388SMartin Matuska 7281ad8388SMartin Matuska /// Number of bytes that have been ran through the match finder, but 7381ad8388SMartin Matuska /// which haven't been encoded by the LZ-based encoder yet. 7481ad8388SMartin Matuska uint32_t read_ahead; 7581ad8388SMartin Matuska 7681ad8388SMartin Matuska /// As long as read_pos is less than read_limit, there is enough 7781ad8388SMartin Matuska /// input available in buffer for at least one encoding loop. 7881ad8388SMartin Matuska /// 7981ad8388SMartin Matuska /// Because of the stateful API, read_limit may and will get greater 8081ad8388SMartin Matuska /// than read_pos quite often. This is taken into account when 8181ad8388SMartin Matuska /// calculating the value for keep_size_after. 8281ad8388SMartin Matuska uint32_t read_limit; 8381ad8388SMartin Matuska 8481ad8388SMartin Matuska /// buffer[write_pos] is the first byte that doesn't contain valid 8581ad8388SMartin Matuska /// uncompressed data; that is, the next input byte will be copied 8681ad8388SMartin Matuska /// to buffer[write_pos]. 8781ad8388SMartin Matuska uint32_t write_pos; 8881ad8388SMartin Matuska 8981ad8388SMartin Matuska /// Number of bytes not hashed before read_pos. This is needed to 9081ad8388SMartin Matuska /// restart the match finder after LZMA_SYNC_FLUSH. 9181ad8388SMartin Matuska uint32_t pending; 9281ad8388SMartin Matuska 9381ad8388SMartin Matuska ////////////////// 9481ad8388SMartin Matuska // Match Finder // 9581ad8388SMartin Matuska ////////////////// 9681ad8388SMartin Matuska 9781ad8388SMartin Matuska /// Find matches. Returns the number of distance-length pairs written 9881ad8388SMartin Matuska /// to the matches array. This is called only via lzma_mf_find(). 9981ad8388SMartin Matuska uint32_t (*find)(lzma_mf *mf, lzma_match *matches); 10081ad8388SMartin Matuska 10181ad8388SMartin Matuska /// Skips num bytes. This is like find() but doesn't make the 10281ad8388SMartin Matuska /// distance-length pairs available, thus being a little faster. 10381ad8388SMartin Matuska /// This is called only via mf_skip(). 10481ad8388SMartin Matuska void (*skip)(lzma_mf *mf, uint32_t num); 10581ad8388SMartin Matuska 10681ad8388SMartin Matuska uint32_t *hash; 10781ad8388SMartin Matuska uint32_t *son; 10881ad8388SMartin Matuska uint32_t cyclic_pos; 10981ad8388SMartin Matuska uint32_t cyclic_size; // Must be dictionary size + 1. 11081ad8388SMartin Matuska uint32_t hash_mask; 11181ad8388SMartin Matuska 11281ad8388SMartin Matuska /// Maximum number of loops in the match finder 11381ad8388SMartin Matuska uint32_t depth; 11481ad8388SMartin Matuska 11581ad8388SMartin Matuska /// Maximum length of a match that the match finder will try to find. 11681ad8388SMartin Matuska uint32_t nice_len; 11781ad8388SMartin Matuska 11881ad8388SMartin Matuska /// Maximum length of a match supported by the LZ-based encoder. 11981ad8388SMartin Matuska /// If the longest match found by the match finder is nice_len, 12081ad8388SMartin Matuska /// mf_find() tries to expand it up to match_len_max bytes. 12181ad8388SMartin Matuska uint32_t match_len_max; 12281ad8388SMartin Matuska 12381ad8388SMartin Matuska /// When running out of input, binary tree match finders need to know 12481ad8388SMartin Matuska /// if it is due to flushing or finishing. The action is used also 12581ad8388SMartin Matuska /// by the LZ-based encoders themselves. 12681ad8388SMartin Matuska lzma_action action; 12781ad8388SMartin Matuska 12881ad8388SMartin Matuska /// Number of elements in hash[] 12953200025SRui Paulo uint32_t hash_count; 13081ad8388SMartin Matuska 13181ad8388SMartin Matuska /// Number of elements in son[] 13281ad8388SMartin Matuska uint32_t sons_count; 13381ad8388SMartin Matuska }; 13481ad8388SMartin Matuska 13581ad8388SMartin Matuska 13681ad8388SMartin Matuska typedef struct { 13781ad8388SMartin Matuska /// Extra amount of data to keep available before the "actual" 13881ad8388SMartin Matuska /// dictionary. 13981ad8388SMartin Matuska size_t before_size; 14081ad8388SMartin Matuska 14181ad8388SMartin Matuska /// Size of the history buffer 14281ad8388SMartin Matuska size_t dict_size; 14381ad8388SMartin Matuska 14481ad8388SMartin Matuska /// Extra amount of data to keep available after the "actual" 14581ad8388SMartin Matuska /// dictionary. 14681ad8388SMartin Matuska size_t after_size; 14781ad8388SMartin Matuska 14881ad8388SMartin Matuska /// Maximum length of a match that the LZ-based encoder can accept. 14981ad8388SMartin Matuska /// This is used to extend matches of length nice_len to the 15081ad8388SMartin Matuska /// maximum possible length. 15181ad8388SMartin Matuska size_t match_len_max; 15281ad8388SMartin Matuska 15381ad8388SMartin Matuska /// Match finder will search matches up to this length. 15481ad8388SMartin Matuska /// This must be less than or equal to match_len_max. 15581ad8388SMartin Matuska size_t nice_len; 15681ad8388SMartin Matuska 15781ad8388SMartin Matuska /// Type of the match finder to use 15881ad8388SMartin Matuska lzma_match_finder match_finder; 15981ad8388SMartin Matuska 16081ad8388SMartin Matuska /// Maximum search depth 16181ad8388SMartin Matuska uint32_t depth; 16281ad8388SMartin Matuska 163*3b35e7eeSXin LI /// Initial dictionary for the match finder to search. 16481ad8388SMartin Matuska const uint8_t *preset_dict; 16581ad8388SMartin Matuska 166*3b35e7eeSXin LI /// If the preset dictionary is NULL, this value is ignored. 167*3b35e7eeSXin LI /// Otherwise this member must indicate the preset dictionary's 168*3b35e7eeSXin LI /// buffer size. If this size is larger than dict_size, then only 169*3b35e7eeSXin LI /// the dict_size sized tail of the preset_dict will be used. 17081ad8388SMartin Matuska uint32_t preset_dict_size; 17181ad8388SMartin Matuska 17281ad8388SMartin Matuska } lzma_lz_options; 17381ad8388SMartin Matuska 17481ad8388SMartin Matuska 17581ad8388SMartin Matuska // The total usable buffer space at any moment outside the match finder: 17681ad8388SMartin Matuska // before_size + dict_size + after_size + match_len_max 17781ad8388SMartin Matuska // 17881ad8388SMartin Matuska // In reality, there's some extra space allocated to prevent the number of 17981ad8388SMartin Matuska // memmove() calls reasonable. The bigger the dict_size is, the bigger 18081ad8388SMartin Matuska // this extra buffer will be since with bigger dictionaries memmove() would 18181ad8388SMartin Matuska // also take longer. 18281ad8388SMartin Matuska // 18381ad8388SMartin Matuska // A single encoder loop in the LZ-based encoder may call the match finder 18481ad8388SMartin Matuska // (mf_find() or mf_skip()) at most after_size times. In other words, 18581ad8388SMartin Matuska // a single encoder loop may increment lzma_mf.read_pos at most after_size 18681ad8388SMartin Matuska // times. Since matches are looked up to 18781ad8388SMartin Matuska // lzma_mf.buffer[lzma_mf.read_pos + match_len_max - 1], the total 18881ad8388SMartin Matuska // amount of extra buffer needed after dict_size becomes 18981ad8388SMartin Matuska // after_size + match_len_max. 19081ad8388SMartin Matuska // 19181ad8388SMartin Matuska // before_size has two uses. The first one is to keep literals available 19281ad8388SMartin Matuska // in cases when the LZ-based encoder has made some read ahead. 19381ad8388SMartin Matuska // TODO: Maybe this could be changed by making the LZ-based encoders to 19481ad8388SMartin Matuska // store the actual literals as they do with length-distance pairs. 19581ad8388SMartin Matuska // 19681ad8388SMartin Matuska // Algorithms such as LZMA2 first try to compress a chunk, and then check 19781ad8388SMartin Matuska // if the encoded result is smaller than the uncompressed one. If the chunk 1981f3ced26SXin LI // was incompressible, it is better to store it in uncompressed form in 19981ad8388SMartin Matuska // the output stream. To do this, the whole uncompressed chunk has to be 20081ad8388SMartin Matuska // still available in the history buffer. before_size achieves that. 20181ad8388SMartin Matuska 20281ad8388SMartin Matuska 20381ad8388SMartin Matuska typedef struct { 20481ad8388SMartin Matuska /// Data specific to the LZ-based encoder 2051456f0f9SXin LI void *coder; 20681ad8388SMartin Matuska 20781ad8388SMartin Matuska /// Function to encode from *dict to out[] 2081456f0f9SXin LI lzma_ret (*code)(void *coder, 20981ad8388SMartin Matuska lzma_mf *restrict mf, uint8_t *restrict out, 21081ad8388SMartin Matuska size_t *restrict out_pos, size_t out_size); 21181ad8388SMartin Matuska 21281ad8388SMartin Matuska /// Free allocated resources 2131456f0f9SXin LI void (*end)(void *coder, const lzma_allocator *allocator); 21481ad8388SMartin Matuska 21581ad8388SMartin Matuska /// Update the options in the middle of the encoding. 2161456f0f9SXin LI lzma_ret (*options_update)(void *coder, const lzma_filter *filter); 21781ad8388SMartin Matuska 21873ed8e77SXin LI /// Set maximum allowed output size 21973ed8e77SXin LI lzma_ret (*set_out_limit)(void *coder, uint64_t *uncomp_size, 22073ed8e77SXin LI uint64_t out_limit); 22173ed8e77SXin LI 22281ad8388SMartin Matuska } lzma_lz_encoder; 22381ad8388SMartin Matuska 22481ad8388SMartin Matuska 22581ad8388SMartin Matuska // Basic steps: 22681ad8388SMartin Matuska // 1. Input gets copied into the dictionary. 22781ad8388SMartin Matuska // 2. Data in dictionary gets run through the match finder byte by byte. 22881ad8388SMartin Matuska // 3. The literals and matches are encoded using e.g. LZMA. 22981ad8388SMartin Matuska // 23081ad8388SMartin Matuska // The bytes that have been ran through the match finder, but not encoded yet, 231*3b35e7eeSXin LI // are called 'read ahead'. 23281ad8388SMartin Matuska 23381ad8388SMartin Matuska 23473ed8e77SXin LI /// Get how many bytes the match finder hashes in its initial step. 23573ed8e77SXin LI /// This is also the minimum nice_len value with the match finder. 23673ed8e77SXin LI static inline uint32_t 23773ed8e77SXin LI mf_get_hash_bytes(lzma_match_finder match_finder) 23873ed8e77SXin LI { 23973ed8e77SXin LI return (uint32_t)match_finder & 0x0F; 24073ed8e77SXin LI } 24173ed8e77SXin LI 24273ed8e77SXin LI 24381ad8388SMartin Matuska /// Get pointer to the first byte not ran through the match finder 24481ad8388SMartin Matuska static inline const uint8_t * 24581ad8388SMartin Matuska mf_ptr(const lzma_mf *mf) 24681ad8388SMartin Matuska { 24781ad8388SMartin Matuska return mf->buffer + mf->read_pos; 24881ad8388SMartin Matuska } 24981ad8388SMartin Matuska 25081ad8388SMartin Matuska 25181ad8388SMartin Matuska /// Get the number of bytes that haven't been ran through the match finder yet. 25281ad8388SMartin Matuska static inline uint32_t 25381ad8388SMartin Matuska mf_avail(const lzma_mf *mf) 25481ad8388SMartin Matuska { 25581ad8388SMartin Matuska return mf->write_pos - mf->read_pos; 25681ad8388SMartin Matuska } 25781ad8388SMartin Matuska 25881ad8388SMartin Matuska 25981ad8388SMartin Matuska /// Get the number of bytes that haven't been encoded yet (some of these 26081ad8388SMartin Matuska /// bytes may have been ran through the match finder though). 26181ad8388SMartin Matuska static inline uint32_t 26281ad8388SMartin Matuska mf_unencoded(const lzma_mf *mf) 26381ad8388SMartin Matuska { 26481ad8388SMartin Matuska return mf->write_pos - mf->read_pos + mf->read_ahead; 26581ad8388SMartin Matuska } 26681ad8388SMartin Matuska 26781ad8388SMartin Matuska 26881ad8388SMartin Matuska /// Calculate the absolute offset from the beginning of the most recent 26981ad8388SMartin Matuska /// dictionary reset. Only the lowest four bits are important, so there's no 27081ad8388SMartin Matuska /// problem that we don't know the 64-bit size of the data encoded so far. 27181ad8388SMartin Matuska /// 27281ad8388SMartin Matuska /// NOTE: When moving the input window, we need to do it so that the lowest 27381ad8388SMartin Matuska /// bits of dict->read_pos are not modified to keep this macro working 27481ad8388SMartin Matuska /// as intended. 27581ad8388SMartin Matuska static inline uint32_t 27681ad8388SMartin Matuska mf_position(const lzma_mf *mf) 27781ad8388SMartin Matuska { 27881ad8388SMartin Matuska return mf->read_pos - mf->read_ahead; 27981ad8388SMartin Matuska } 28081ad8388SMartin Matuska 28181ad8388SMartin Matuska 28281ad8388SMartin Matuska /// Since everything else begins with mf_, use it also for lzma_mf_find(). 28381ad8388SMartin Matuska #define mf_find lzma_mf_find 28481ad8388SMartin Matuska 28581ad8388SMartin Matuska 28681ad8388SMartin Matuska /// Skip the given number of bytes. This is used when a good match was found. 28781ad8388SMartin Matuska /// For example, if mf_find() finds a match of 200 bytes long, the first byte 28881ad8388SMartin Matuska /// of that match was already consumed by mf_find(), and the rest 199 bytes 28981ad8388SMartin Matuska /// have to be skipped with mf_skip(mf, 199). 29081ad8388SMartin Matuska static inline void 29181ad8388SMartin Matuska mf_skip(lzma_mf *mf, uint32_t amount) 29281ad8388SMartin Matuska { 29381ad8388SMartin Matuska if (amount != 0) { 29481ad8388SMartin Matuska mf->skip(mf, amount); 29581ad8388SMartin Matuska mf->read_ahead += amount; 29681ad8388SMartin Matuska } 29781ad8388SMartin Matuska } 29881ad8388SMartin Matuska 29981ad8388SMartin Matuska 30081ad8388SMartin Matuska /// Copies at most *left number of bytes from the history buffer 30181ad8388SMartin Matuska /// to out[]. This is needed by LZMA2 to encode uncompressed chunks. 30281ad8388SMartin Matuska static inline void 30381ad8388SMartin Matuska mf_read(lzma_mf *mf, uint8_t *out, size_t *out_pos, size_t out_size, 30481ad8388SMartin Matuska size_t *left) 30581ad8388SMartin Matuska { 30681ad8388SMartin Matuska const size_t out_avail = out_size - *out_pos; 307e0f0e66dSMartin Matuska const size_t copy_size = my_min(out_avail, *left); 30881ad8388SMartin Matuska 30981ad8388SMartin Matuska assert(mf->read_ahead == 0); 31081ad8388SMartin Matuska assert(mf->read_pos >= *left); 31181ad8388SMartin Matuska 31281ad8388SMartin Matuska memcpy(out + *out_pos, mf->buffer + mf->read_pos - *left, 31381ad8388SMartin Matuska copy_size); 31481ad8388SMartin Matuska 31581ad8388SMartin Matuska *out_pos += copy_size; 31681ad8388SMartin Matuska *left -= copy_size; 31781ad8388SMartin Matuska return; 31881ad8388SMartin Matuska } 31981ad8388SMartin Matuska 32081ad8388SMartin Matuska 32181ad8388SMartin Matuska extern lzma_ret lzma_lz_encoder_init( 32253200025SRui Paulo lzma_next_coder *next, const lzma_allocator *allocator, 32381ad8388SMartin Matuska const lzma_filter_info *filters, 32481ad8388SMartin Matuska lzma_ret (*lz_init)(lzma_lz_encoder *lz, 32573ed8e77SXin LI const lzma_allocator *allocator, 32673ed8e77SXin LI lzma_vli id, const void *options, 32781ad8388SMartin Matuska lzma_lz_options *lz_options)); 32881ad8388SMartin Matuska 32981ad8388SMartin Matuska 33081ad8388SMartin Matuska extern uint64_t lzma_lz_encoder_memusage(const lzma_lz_options *lz_options); 33181ad8388SMartin Matuska 33281ad8388SMartin Matuska 33381ad8388SMartin Matuska // These are only for LZ encoder's internal use. 33481ad8388SMartin Matuska extern uint32_t lzma_mf_find( 33581ad8388SMartin Matuska lzma_mf *mf, uint32_t *count, lzma_match *matches); 33681ad8388SMartin Matuska 33781ad8388SMartin Matuska extern uint32_t lzma_mf_hc3_find(lzma_mf *dict, lzma_match *matches); 33881ad8388SMartin Matuska extern void lzma_mf_hc3_skip(lzma_mf *dict, uint32_t amount); 33981ad8388SMartin Matuska 34081ad8388SMartin Matuska extern uint32_t lzma_mf_hc4_find(lzma_mf *dict, lzma_match *matches); 34181ad8388SMartin Matuska extern void lzma_mf_hc4_skip(lzma_mf *dict, uint32_t amount); 34281ad8388SMartin Matuska 34381ad8388SMartin Matuska extern uint32_t lzma_mf_bt2_find(lzma_mf *dict, lzma_match *matches); 34481ad8388SMartin Matuska extern void lzma_mf_bt2_skip(lzma_mf *dict, uint32_t amount); 34581ad8388SMartin Matuska 34681ad8388SMartin Matuska extern uint32_t lzma_mf_bt3_find(lzma_mf *dict, lzma_match *matches); 34781ad8388SMartin Matuska extern void lzma_mf_bt3_skip(lzma_mf *dict, uint32_t amount); 34881ad8388SMartin Matuska 34981ad8388SMartin Matuska extern uint32_t lzma_mf_bt4_find(lzma_mf *dict, lzma_match *matches); 35081ad8388SMartin Matuska extern void lzma_mf_bt4_skip(lzma_mf *dict, uint32_t amount); 35181ad8388SMartin Matuska 35281ad8388SMartin Matuska #endif 353