1 // SPDX-License-Identifier: 0BSD 2 3 /////////////////////////////////////////////////////////////////////////////// 4 // 5 /// \file lz_decoder.h 6 /// \brief LZ out window 7 /// 8 // Authors: Igor Pavlov 9 // Lasse Collin 10 // 11 /////////////////////////////////////////////////////////////////////////////// 12 13 #ifndef LZMA_LZ_DECODER_H 14 #define LZMA_LZ_DECODER_H 15 16 #include "common.h" 17 18 #ifdef HAVE_IMMINTRIN_H 19 # include <immintrin.h> 20 #endif 21 22 23 // dict_repeat() implementation variant: 24 // 0 = Byte-by-byte copying only. 25 // 1 = Use memcpy() for non-overlapping copies. 26 // 2 = Use x86 SSE2 for non-overlapping copies. 27 #ifndef LZMA_LZ_DECODER_CONFIG 28 # if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ 29 && defined(HAVE_IMMINTRIN_H) \ 30 && (defined(__SSE2__) || defined(_M_X64) \ 31 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)) 32 # define LZMA_LZ_DECODER_CONFIG 2 33 # else 34 # define LZMA_LZ_DECODER_CONFIG 1 35 # endif 36 #endif 37 38 /// Byte-by-byte and memcpy() copy exactly the amount needed. Other methods 39 /// can copy up to LZ_DICT_EXTRA bytes more than requested, and this amount 40 /// of extra space is needed at the end of the allocated dictionary buffer. 41 /// 42 /// NOTE: If this is increased, update LZMA_DICT_REPEAT_MAX too. 43 #if LZMA_LZ_DECODER_CONFIG >= 2 44 # define LZ_DICT_EXTRA 32 45 #else 46 # define LZ_DICT_EXTRA 0 47 #endif 48 49 /// Maximum number of bytes that dict_repeat() may copy. The allocated 50 /// dictionary buffer will be 2 * LZ_DICT_REPEAT_MAX + LZMA_DICT_EXTRA bytes 51 /// larger than the actual dictionary size: 52 /// 53 /// (1) Every time the decoder reaches the end of the dictionary buffer, 54 /// the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning. 55 /// This way dict_repeat() will only need to copy from one place, 56 /// never from both the end and beginning of the buffer. 57 /// 58 /// (2) The other LZ_DICT_REPEAT_MAX bytes is kept as a buffer between 59 /// the oldest byte still in the dictionary and the current write 60 /// position. This way dict_repeat() with the maximum valid distance 61 /// won't need memmove() as the copying cannot overlap. 62 /// 63 /// (3) LZ_DICT_EXTRA bytes are required at the end of the dictionary buffer 64 /// so that extra copying done by dict_repeat() won't write or read past 65 /// the end of the allocated buffer. This amount is *not* counted as part 66 /// of lzma_dict.size. 67 /// 68 /// Note that memcpy() still cannot be used if distance < len. 69 /// 70 /// LZMA's longest match length is 273 bytes. The LZMA decoder looks at 71 /// the lowest four bits of the dictionary position, thus 273 must be 72 /// rounded up to the next multiple of 16 (288). In addition, optimized 73 /// dict_repeat() copies 32 bytes at a time, thus this must also be 74 /// a multiple of 32. 75 #define LZ_DICT_REPEAT_MAX 288 76 77 /// Initial position in lzma_dict.buf when the dictionary is empty. 78 #define LZ_DICT_INIT_POS (2 * LZ_DICT_REPEAT_MAX) 79 80 81 typedef struct { 82 /// Pointer to the dictionary buffer. 83 uint8_t *buf; 84 85 /// Write position in dictionary. The next byte will be written to 86 /// buf[pos]. 87 size_t pos; 88 89 /// Indicates how full the dictionary is. This is used by 90 /// dict_is_distance_valid() to detect corrupt files that would 91 /// read beyond the beginning of the dictionary. 92 size_t full; 93 94 /// Write limit 95 size_t limit; 96 97 /// Allocated size of buf. This is 2 * LZ_DICT_REPEAT_MAX bytes 98 /// larger than the actual dictionary size. This is enforced by 99 /// how the value for "full" is set; it can be at most 100 /// "size - 2 * LZ_DICT_REPEAT_MAX". 101 size_t size; 102 103 /// True once the dictionary has become full and the writing position 104 /// has been wrapped in decode_buffer() in lz_decoder.c. 105 bool has_wrapped; 106 107 /// True when dictionary should be reset before decoding more data. 108 bool need_reset; 109 110 } lzma_dict; 111 112 113 typedef struct { 114 size_t dict_size; 115 const uint8_t *preset_dict; 116 size_t preset_dict_size; 117 } lzma_lz_options; 118 119 120 typedef struct { 121 /// Data specific to the LZ-based decoder 122 void *coder; 123 124 /// Function to decode from in[] to *dict 125 lzma_ret (*code)(void *coder, 126 lzma_dict *restrict dict, const uint8_t *restrict in, 127 size_t *restrict in_pos, size_t in_size); 128 129 void (*reset)(void *coder, const void *options); 130 131 /// Set the uncompressed size. If uncompressed_size == LZMA_VLI_UNKNOWN 132 /// then allow_eopm will always be true. 133 void (*set_uncompressed)(void *coder, lzma_vli uncompressed_size, 134 bool allow_eopm); 135 136 /// Free allocated resources 137 void (*end)(void *coder, const lzma_allocator *allocator); 138 139 } lzma_lz_decoder; 140 141 142 #define LZMA_LZ_DECODER_INIT \ 143 (lzma_lz_decoder){ \ 144 .coder = NULL, \ 145 .code = NULL, \ 146 .reset = NULL, \ 147 .set_uncompressed = NULL, \ 148 .end = NULL, \ 149 } 150 151 152 extern lzma_ret lzma_lz_decoder_init(lzma_next_coder *next, 153 const lzma_allocator *allocator, 154 const lzma_filter_info *filters, 155 lzma_ret (*lz_init)(lzma_lz_decoder *lz, 156 const lzma_allocator *allocator, 157 lzma_vli id, const void *options, 158 lzma_lz_options *lz_options)); 159 160 extern uint64_t lzma_lz_decoder_memusage(size_t dictionary_size); 161 162 163 ////////////////////// 164 // Inline functions // 165 ////////////////////// 166 167 /// Get a byte from the history buffer. 168 static inline uint8_t 169 dict_get(const lzma_dict *const dict, const uint32_t distance) 170 { 171 return dict->buf[dict->pos - distance - 1 172 + (distance < dict->pos 173 ? 0 : dict->size - LZ_DICT_REPEAT_MAX)]; 174 } 175 176 177 /// Optimized version of dict_get(dict, 0) 178 static inline uint8_t 179 dict_get0(const lzma_dict *const dict) 180 { 181 return dict->buf[dict->pos - 1]; 182 } 183 184 185 /// Test if dictionary is empty. 186 static inline bool 187 dict_is_empty(const lzma_dict *const dict) 188 { 189 return dict->full == 0; 190 } 191 192 193 /// Validate the match distance 194 static inline bool 195 dict_is_distance_valid(const lzma_dict *const dict, const size_t distance) 196 { 197 return dict->full > distance; 198 } 199 200 201 /// Repeat *len bytes at distance. 202 static inline bool 203 dict_repeat(lzma_dict *restrict dict, 204 uint32_t distance, uint32_t *restrict len) 205 { 206 // Don't write past the end of the dictionary. 207 const size_t dict_avail = dict->limit - dict->pos; 208 uint32_t left = my_min(dict_avail, *len); 209 *len -= left; 210 211 size_t back = dict->pos - distance - 1; 212 if (distance >= dict->pos) 213 back += dict->size - LZ_DICT_REPEAT_MAX; 214 215 #if LZMA_LZ_DECODER_CONFIG == 0 216 // Minimal byte-by-byte method. This might be the least bad choice 217 // if memcpy() isn't fast and there's no replacement for it below. 218 while (left-- > 0) { 219 dict->buf[dict->pos++] = dict->buf[back++]; 220 } 221 222 #else 223 // Because memcpy() or a similar method can be faster than copying 224 // byte by byte in a loop, the copying process is split into 225 // two cases. 226 if (distance < left) { 227 // Source and target areas overlap, thus we can't use 228 // memcpy() nor even memmove() safely. 229 do { 230 dict->buf[dict->pos++] = dict->buf[back++]; 231 } while (--left > 0); 232 } else { 233 # if LZMA_LZ_DECODER_CONFIG == 1 234 memcpy(dict->buf + dict->pos, dict->buf + back, left); 235 dict->pos += left; 236 237 # elif LZMA_LZ_DECODER_CONFIG == 2 238 // This can copy up to 32 bytes more than required. 239 // (If left == 0, we still copy 32 bytes.) 240 size_t pos = dict->pos; 241 dict->pos += left; 242 do { 243 const __m128i x0 = _mm_loadu_si128( 244 (__m128i *)(dict->buf + back)); 245 const __m128i x1 = _mm_loadu_si128( 246 (__m128i *)(dict->buf + back + 16)); 247 back += 32; 248 _mm_storeu_si128( 249 (__m128i *)(dict->buf + pos), x0); 250 _mm_storeu_si128( 251 (__m128i *)(dict->buf + pos + 16), x1); 252 pos += 32; 253 } while (pos < dict->pos); 254 255 # else 256 # error "Invalid LZMA_LZ_DECODER_CONFIG value" 257 # endif 258 } 259 #endif 260 261 // Update how full the dictionary is. 262 if (!dict->has_wrapped) 263 dict->full = dict->pos - LZ_DICT_INIT_POS; 264 265 return *len != 0; 266 } 267 268 269 static inline void 270 dict_put(lzma_dict *restrict dict, uint8_t byte) 271 { 272 dict->buf[dict->pos++] = byte; 273 274 if (!dict->has_wrapped) 275 dict->full = dict->pos - LZ_DICT_INIT_POS; 276 } 277 278 279 /// Puts one byte into the dictionary. Returns true if the dictionary was 280 /// already full and the byte couldn't be added. 281 static inline bool 282 dict_put_safe(lzma_dict *restrict dict, uint8_t byte) 283 { 284 if (unlikely(dict->pos == dict->limit)) 285 return true; 286 287 dict_put(dict, byte); 288 return false; 289 } 290 291 292 /// Copies arbitrary amount of data into the dictionary. 293 static inline void 294 dict_write(lzma_dict *restrict dict, const uint8_t *restrict in, 295 size_t *restrict in_pos, size_t in_size, 296 size_t *restrict left) 297 { 298 // NOTE: If we are being given more data than the size of the 299 // dictionary, it could be possible to optimize the LZ decoder 300 // so that not everything needs to go through the dictionary. 301 // This shouldn't be very common thing in practice though, and 302 // the slowdown of one extra memcpy() isn't bad compared to how 303 // much time it would have taken if the data were compressed. 304 305 if (in_size - *in_pos > *left) 306 in_size = *in_pos + *left; 307 308 *left -= lzma_bufcpy(in, in_pos, in_size, 309 dict->buf, &dict->pos, dict->limit); 310 311 if (!dict->has_wrapped) 312 dict->full = dict->pos - LZ_DICT_INIT_POS; 313 314 return; 315 } 316 317 318 static inline void 319 dict_reset(lzma_dict *dict) 320 { 321 dict->need_reset = true; 322 return; 323 } 324 325 #endif 326