1 /////////////////////////////////////////////////////////////////////////////// 2 // 3 /// \file fastpos.h 4 /// \brief Kind of two-bit version of bit scan reverse 5 /// 6 // Authors: Igor Pavlov 7 // Lasse Collin 8 // 9 // This file has been put into the public domain. 10 // You can do whatever you want with this file. 11 // 12 /////////////////////////////////////////////////////////////////////////////// 13 14 #ifndef LZMA_FASTPOS_H 15 #define LZMA_FASTPOS_H 16 17 // LZMA encodes match distances by storing the highest two bits using 18 // a six-bit value [0, 63], and then the missing lower bits. 19 // Dictionary size is also stored using this encoding in the .xz 20 // file format header. 21 // 22 // fastpos.h provides a way to quickly find out the correct six-bit 23 // values. The following table gives some examples of this encoding: 24 // 25 // dist return 26 // 0 0 27 // 1 1 28 // 2 2 29 // 3 3 30 // 4 4 31 // 5 4 32 // 6 5 33 // 7 5 34 // 8 6 35 // 11 6 36 // 12 7 37 // ... ... 38 // 15 7 39 // 16 8 40 // 17 8 41 // ... ... 42 // 23 8 43 // 24 9 44 // 25 9 45 // ... ... 46 // 47 // 48 // Provided functions or macros 49 // ---------------------------- 50 // 51 // get_dist_slot(dist) is the basic version. get_dist_slot_2(dist) 52 // assumes that dist >= FULL_DISTANCES, thus the result is at least 53 // FULL_DISTANCES_BITS * 2. Using get_dist_slot(dist) instead of 54 // get_dist_slot_2(dist) would give the same result, but get_dist_slot_2(dist) 55 // should be tiny bit faster due to the assumption being made. 56 // 57 // 58 // Size vs. speed 59 // -------------- 60 // 61 // With some CPUs that have fast BSR (bit scan reverse) instruction, the 62 // size optimized version is slightly faster than the bigger table based 63 // approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III 64 // and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that 65 // would still have speed roughly comparable to the table version. Older 66 // x86 CPUs like the original Pentium have very slow BSR; on those systems 67 // the table version is a lot faster. 68 // 69 // On some CPUs, the table version is a lot faster when using position 70 // dependent code, but with position independent code the size optimized 71 // version is slightly faster. This occurs at least on 32-bit SPARC (no 72 // ASM optimizations). 73 // 74 // I'm making the table version the default, because that has good speed 75 // on all systems I have tried. The size optimized version is sometimes 76 // slightly faster, but sometimes it is a lot slower. 77 78 #ifdef HAVE_SMALL 79 # define get_dist_slot(dist) \ 80 ((dist) <= 4 ? (dist) : get_dist_slot_2(dist)) 81 82 static inline uint32_t 83 get_dist_slot_2(uint32_t dist) 84 { 85 const uint32_t i = bsr32(dist); 86 return (i + i) + ((dist >> (i - 1)) & 1); 87 } 88 89 90 #else 91 92 #define FASTPOS_BITS 13 93 94 lzma_attr_visibility_hidden 95 extern const uint8_t lzma_fastpos[1 << FASTPOS_BITS]; 96 97 98 #define fastpos_shift(extra, n) \ 99 ((extra) + (n) * (FASTPOS_BITS - 1)) 100 101 #define fastpos_limit(extra, n) \ 102 (UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n))) 103 104 #define fastpos_result(dist, extra, n) \ 105 (uint32_t)(lzma_fastpos[(dist) >> fastpos_shift(extra, n)]) \ 106 + 2 * fastpos_shift(extra, n) 107 108 109 static inline uint32_t 110 get_dist_slot(uint32_t dist) 111 { 112 // If it is small enough, we can pick the result directly from 113 // the precalculated table. 114 if (dist < fastpos_limit(0, 0)) 115 return lzma_fastpos[dist]; 116 117 if (dist < fastpos_limit(0, 1)) 118 return fastpos_result(dist, 0, 1); 119 120 return fastpos_result(dist, 0, 2); 121 } 122 123 124 #ifdef FULL_DISTANCES_BITS 125 static inline uint32_t 126 get_dist_slot_2(uint32_t dist) 127 { 128 assert(dist >= FULL_DISTANCES); 129 130 if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 0)) 131 return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 0); 132 133 if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 1)) 134 return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 1); 135 136 return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 2); 137 } 138 #endif 139 140 #endif 141 142 #endif 143