1 // SPDX-License-Identifier: 0BSD 2 3 /////////////////////////////////////////////////////////////////////////////// 4 // 5 /// \file fastpos.h 6 /// \brief Kind of two-bit version of bit scan reverse 7 /// 8 // Authors: Igor Pavlov 9 // Lasse Collin 10 // 11 /////////////////////////////////////////////////////////////////////////////// 12 13 #ifndef LZMA_FASTPOS_H 14 #define LZMA_FASTPOS_H 15 16 // LZMA encodes match distances by storing the highest two bits using 17 // a six-bit value [0, 63], and then the missing lower bits. 18 // Dictionary size is also stored using this encoding in the .xz 19 // file format header. 20 // 21 // fastpos.h provides a way to quickly find out the correct six-bit 22 // values. The following table gives some examples of this encoding: 23 // 24 // dist return 25 // 0 0 26 // 1 1 27 // 2 2 28 // 3 3 29 // 4 4 30 // 5 4 31 // 6 5 32 // 7 5 33 // 8 6 34 // 11 6 35 // 12 7 36 // ... ... 37 // 15 7 38 // 16 8 39 // 17 8 40 // ... ... 41 // 23 8 42 // 24 9 43 // 25 9 44 // ... ... 45 // 46 // 47 // Provided functions or macros 48 // ---------------------------- 49 // 50 // get_dist_slot(dist) is the basic version. get_dist_slot_2(dist) 51 // assumes that dist >= FULL_DISTANCES, thus the result is at least 52 // FULL_DISTANCES_BITS * 2. Using get_dist_slot(dist) instead of 53 // get_dist_slot_2(dist) would give the same result, but get_dist_slot_2(dist) 54 // should be tiny bit faster due to the assumption being made. 55 // 56 // 57 // Size vs. speed 58 // -------------- 59 // 60 // With some CPUs that have fast BSR (bit scan reverse) instruction, the 61 // size optimized version is slightly faster than the bigger table based 62 // approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III 63 // and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that 64 // would still have speed roughly comparable to the table version. Older 65 // x86 CPUs like the original Pentium have very slow BSR; on those systems 66 // the table version is a lot faster. 67 // 68 // On some CPUs, the table version is a lot faster when using position 69 // dependent code, but with position independent code the size optimized 70 // version is slightly faster. This occurs at least on 32-bit SPARC (no 71 // ASM optimizations). 72 // 73 // I'm making the table version the default, because that has good speed 74 // on all systems I have tried. The size optimized version is sometimes 75 // slightly faster, but sometimes it is a lot slower. 76 77 #ifdef HAVE_SMALL 78 # define get_dist_slot(dist) \ 79 ((dist) <= 4 ? (dist) : get_dist_slot_2(dist)) 80 81 static inline uint32_t 82 get_dist_slot_2(uint32_t dist) 83 { 84 const uint32_t i = bsr32(dist); 85 return (i + i) + ((dist >> (i - 1)) & 1); 86 } 87 88 89 #else 90 91 #define FASTPOS_BITS 13 92 93 lzma_attr_visibility_hidden 94 extern const uint8_t lzma_fastpos[1 << FASTPOS_BITS]; 95 96 97 #define fastpos_shift(extra, n) \ 98 ((extra) + (n) * (FASTPOS_BITS - 1)) 99 100 #define fastpos_limit(extra, n) \ 101 (UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n))) 102 103 #define fastpos_result(dist, extra, n) \ 104 (uint32_t)(lzma_fastpos[(dist) >> fastpos_shift(extra, n)]) \ 105 + 2 * fastpos_shift(extra, n) 106 107 108 static inline uint32_t 109 get_dist_slot(uint32_t dist) 110 { 111 // If it is small enough, we can pick the result directly from 112 // the precalculated table. 113 if (dist < fastpos_limit(0, 0)) 114 return lzma_fastpos[dist]; 115 116 if (dist < fastpos_limit(0, 1)) 117 return fastpos_result(dist, 0, 1); 118 119 return fastpos_result(dist, 0, 2); 120 } 121 122 123 #ifdef FULL_DISTANCES_BITS 124 static inline uint32_t 125 get_dist_slot_2(uint32_t dist) 126 { 127 assert(dist >= FULL_DISTANCES); 128 129 if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 0)) 130 return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 0); 131 132 if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 1)) 133 return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 1); 134 135 return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 2); 136 } 137 #endif 138 139 #endif 140 141 #endif 142