1*3b35e7eeSXin LI // SPDX-License-Identifier: 0BSD 2*3b35e7eeSXin LI 381ad8388SMartin Matuska /////////////////////////////////////////////////////////////////////////////// 481ad8388SMartin Matuska // 581ad8388SMartin Matuska /// \file fastpos.h 681ad8388SMartin Matuska /// \brief Kind of two-bit version of bit scan reverse 781ad8388SMartin Matuska /// 881ad8388SMartin Matuska // Authors: Igor Pavlov 981ad8388SMartin Matuska // Lasse Collin 1081ad8388SMartin Matuska // 1181ad8388SMartin Matuska /////////////////////////////////////////////////////////////////////////////// 1281ad8388SMartin Matuska 1381ad8388SMartin Matuska #ifndef LZMA_FASTPOS_H 1481ad8388SMartin Matuska #define LZMA_FASTPOS_H 1581ad8388SMartin Matuska 1653200025SRui Paulo // LZMA encodes match distances by storing the highest two bits using 1753200025SRui Paulo // a six-bit value [0, 63], and then the missing lower bits. 1853200025SRui Paulo // Dictionary size is also stored using this encoding in the .xz 1981ad8388SMartin Matuska // file format header. 2081ad8388SMartin Matuska // 2181ad8388SMartin Matuska // fastpos.h provides a way to quickly find out the correct six-bit 2281ad8388SMartin Matuska // values. The following table gives some examples of this encoding: 2381ad8388SMartin Matuska // 2453200025SRui Paulo // dist return 2581ad8388SMartin Matuska // 0 0 2681ad8388SMartin Matuska // 1 1 2781ad8388SMartin Matuska // 2 2 2881ad8388SMartin Matuska // 3 3 2981ad8388SMartin Matuska // 4 4 3081ad8388SMartin Matuska // 5 4 3181ad8388SMartin Matuska // 6 5 3281ad8388SMartin Matuska // 7 5 3381ad8388SMartin Matuska // 8 6 3481ad8388SMartin Matuska // 11 6 3581ad8388SMartin Matuska // 12 7 3681ad8388SMartin Matuska // ... ... 3781ad8388SMartin Matuska // 15 7 3881ad8388SMartin Matuska // 16 8 3981ad8388SMartin Matuska // 17 8 4081ad8388SMartin Matuska // ... ... 4181ad8388SMartin Matuska // 23 8 4281ad8388SMartin Matuska // 24 9 4381ad8388SMartin Matuska // 25 9 4481ad8388SMartin Matuska // ... ... 4581ad8388SMartin Matuska // 4681ad8388SMartin Matuska // 4781ad8388SMartin Matuska // Provided functions or macros 4881ad8388SMartin Matuska // ---------------------------- 4981ad8388SMartin Matuska // 5053200025SRui Paulo // get_dist_slot(dist) is the basic version. get_dist_slot_2(dist) 5153200025SRui Paulo // assumes that dist >= FULL_DISTANCES, thus the result is at least 5253200025SRui Paulo // FULL_DISTANCES_BITS * 2. Using get_dist_slot(dist) instead of 5353200025SRui Paulo // get_dist_slot_2(dist) would give the same result, but get_dist_slot_2(dist) 5481ad8388SMartin Matuska // should be tiny bit faster due to the assumption being made. 5581ad8388SMartin Matuska // 5681ad8388SMartin Matuska // 5781ad8388SMartin Matuska // Size vs. speed 5881ad8388SMartin Matuska // -------------- 5981ad8388SMartin Matuska // 6081ad8388SMartin Matuska // With some CPUs that have fast BSR (bit scan reverse) instruction, the 6181ad8388SMartin Matuska // size optimized version is slightly faster than the bigger table based 6281ad8388SMartin Matuska // approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III 6381ad8388SMartin Matuska // and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that 6481ad8388SMartin Matuska // would still have speed roughly comparable to the table version. Older 6581ad8388SMartin Matuska // x86 CPUs like the original Pentium have very slow BSR; on those systems 6681ad8388SMartin Matuska // the table version is a lot faster. 6781ad8388SMartin Matuska // 6881ad8388SMartin Matuska // On some CPUs, the table version is a lot faster when using position 6981ad8388SMartin Matuska // dependent code, but with position independent code the size optimized 7081ad8388SMartin Matuska // version is slightly faster. This occurs at least on 32-bit SPARC (no 7181ad8388SMartin Matuska // ASM optimizations). 7281ad8388SMartin Matuska // 7381ad8388SMartin Matuska // I'm making the table version the default, because that has good speed 7481ad8388SMartin Matuska // on all systems I have tried. The size optimized version is sometimes 7581ad8388SMartin Matuska // slightly faster, but sometimes it is a lot slower. 7681ad8388SMartin Matuska 7781ad8388SMartin Matuska #ifdef HAVE_SMALL 7853200025SRui Paulo # define get_dist_slot(dist) \ 7953200025SRui Paulo ((dist) <= 4 ? (dist) : get_dist_slot_2(dist)) 8081ad8388SMartin Matuska 8181ad8388SMartin Matuska static inline uint32_t 8253200025SRui Paulo get_dist_slot_2(uint32_t dist) 8381ad8388SMartin Matuska { 8453200025SRui Paulo const uint32_t i = bsr32(dist); 8553200025SRui Paulo return (i + i) + ((dist >> (i - 1)) & 1); 8681ad8388SMartin Matuska } 8781ad8388SMartin Matuska 8881ad8388SMartin Matuska 8981ad8388SMartin Matuska #else 9081ad8388SMartin Matuska 9181ad8388SMartin Matuska #define FASTPOS_BITS 13 9281ad8388SMartin Matuska 93ca6a6373SXin LI lzma_attr_visibility_hidden 9481ad8388SMartin Matuska extern const uint8_t lzma_fastpos[1 << FASTPOS_BITS]; 9581ad8388SMartin Matuska 9681ad8388SMartin Matuska 9781ad8388SMartin Matuska #define fastpos_shift(extra, n) \ 9881ad8388SMartin Matuska ((extra) + (n) * (FASTPOS_BITS - 1)) 9981ad8388SMartin Matuska 10081ad8388SMartin Matuska #define fastpos_limit(extra, n) \ 10181ad8388SMartin Matuska (UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n))) 10281ad8388SMartin Matuska 10353200025SRui Paulo #define fastpos_result(dist, extra, n) \ 104a8675d92SXin LI (uint32_t)(lzma_fastpos[(dist) >> fastpos_shift(extra, n)]) \ 10581ad8388SMartin Matuska + 2 * fastpos_shift(extra, n) 10681ad8388SMartin Matuska 10781ad8388SMartin Matuska 10881ad8388SMartin Matuska static inline uint32_t 10953200025SRui Paulo get_dist_slot(uint32_t dist) 11081ad8388SMartin Matuska { 11181ad8388SMartin Matuska // If it is small enough, we can pick the result directly from 11281ad8388SMartin Matuska // the precalculated table. 11353200025SRui Paulo if (dist < fastpos_limit(0, 0)) 11453200025SRui Paulo return lzma_fastpos[dist]; 11581ad8388SMartin Matuska 11653200025SRui Paulo if (dist < fastpos_limit(0, 1)) 11753200025SRui Paulo return fastpos_result(dist, 0, 1); 11881ad8388SMartin Matuska 11953200025SRui Paulo return fastpos_result(dist, 0, 2); 12081ad8388SMartin Matuska } 12181ad8388SMartin Matuska 12281ad8388SMartin Matuska 12381ad8388SMartin Matuska #ifdef FULL_DISTANCES_BITS 12481ad8388SMartin Matuska static inline uint32_t 12553200025SRui Paulo get_dist_slot_2(uint32_t dist) 12681ad8388SMartin Matuska { 12753200025SRui Paulo assert(dist >= FULL_DISTANCES); 12881ad8388SMartin Matuska 12953200025SRui Paulo if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 0)) 13053200025SRui Paulo return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 0); 13181ad8388SMartin Matuska 13253200025SRui Paulo if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 1)) 13353200025SRui Paulo return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 1); 13481ad8388SMartin Matuska 13553200025SRui Paulo return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 2); 13681ad8388SMartin Matuska } 13781ad8388SMartin Matuska #endif 13881ad8388SMartin Matuska 13981ad8388SMartin Matuska #endif 14081ad8388SMartin Matuska 14181ad8388SMartin Matuska #endif 142