xref: /freebsd/contrib/xz/src/liblzma/lzma/fastpos.h (revision 3b35e7ee8de9b0260149a2b77e87a2b9c7a36244)
1*3b35e7eeSXin LI // SPDX-License-Identifier: 0BSD
2*3b35e7eeSXin LI 
381ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
481ad8388SMartin Matuska //
581ad8388SMartin Matuska /// \file       fastpos.h
681ad8388SMartin Matuska /// \brief      Kind of two-bit version of bit scan reverse
781ad8388SMartin Matuska ///
881ad8388SMartin Matuska //  Authors:    Igor Pavlov
981ad8388SMartin Matuska //              Lasse Collin
1081ad8388SMartin Matuska //
1181ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
1281ad8388SMartin Matuska 
1381ad8388SMartin Matuska #ifndef LZMA_FASTPOS_H
1481ad8388SMartin Matuska #define LZMA_FASTPOS_H
1581ad8388SMartin Matuska 
1653200025SRui Paulo // LZMA encodes match distances by storing the highest two bits using
1753200025SRui Paulo // a six-bit value [0, 63], and then the missing lower bits.
1853200025SRui Paulo // Dictionary size is also stored using this encoding in the .xz
1981ad8388SMartin Matuska // file format header.
2081ad8388SMartin Matuska //
2181ad8388SMartin Matuska // fastpos.h provides a way to quickly find out the correct six-bit
2281ad8388SMartin Matuska // values. The following table gives some examples of this encoding:
2381ad8388SMartin Matuska //
2453200025SRui Paulo //     dist   return
2581ad8388SMartin Matuska //       0       0
2681ad8388SMartin Matuska //       1       1
2781ad8388SMartin Matuska //       2       2
2881ad8388SMartin Matuska //       3       3
2981ad8388SMartin Matuska //       4       4
3081ad8388SMartin Matuska //       5       4
3181ad8388SMartin Matuska //       6       5
3281ad8388SMartin Matuska //       7       5
3381ad8388SMartin Matuska //       8       6
3481ad8388SMartin Matuska //      11       6
3581ad8388SMartin Matuska //      12       7
3681ad8388SMartin Matuska //     ...      ...
3781ad8388SMartin Matuska //      15       7
3881ad8388SMartin Matuska //      16       8
3981ad8388SMartin Matuska //      17       8
4081ad8388SMartin Matuska //     ...      ...
4181ad8388SMartin Matuska //      23       8
4281ad8388SMartin Matuska //      24       9
4381ad8388SMartin Matuska //      25       9
4481ad8388SMartin Matuska //     ...      ...
4581ad8388SMartin Matuska //
4681ad8388SMartin Matuska //
4781ad8388SMartin Matuska // Provided functions or macros
4881ad8388SMartin Matuska // ----------------------------
4981ad8388SMartin Matuska //
5053200025SRui Paulo // get_dist_slot(dist) is the basic version. get_dist_slot_2(dist)
5153200025SRui Paulo // assumes that dist >= FULL_DISTANCES, thus the result is at least
5253200025SRui Paulo // FULL_DISTANCES_BITS * 2. Using get_dist_slot(dist) instead of
5353200025SRui Paulo // get_dist_slot_2(dist) would give the same result, but get_dist_slot_2(dist)
5481ad8388SMartin Matuska // should be tiny bit faster due to the assumption being made.
5581ad8388SMartin Matuska //
5681ad8388SMartin Matuska //
5781ad8388SMartin Matuska // Size vs. speed
5881ad8388SMartin Matuska // --------------
5981ad8388SMartin Matuska //
6081ad8388SMartin Matuska // With some CPUs that have fast BSR (bit scan reverse) instruction, the
6181ad8388SMartin Matuska // size optimized version is slightly faster than the bigger table based
6281ad8388SMartin Matuska // approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III
6381ad8388SMartin Matuska // and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that
6481ad8388SMartin Matuska // would still have speed roughly comparable to the table version. Older
6581ad8388SMartin Matuska // x86 CPUs like the original Pentium have very slow BSR; on those systems
6681ad8388SMartin Matuska // the table version is a lot faster.
6781ad8388SMartin Matuska //
6881ad8388SMartin Matuska // On some CPUs, the table version is a lot faster when using position
6981ad8388SMartin Matuska // dependent code, but with position independent code the size optimized
7081ad8388SMartin Matuska // version is slightly faster. This occurs at least on 32-bit SPARC (no
7181ad8388SMartin Matuska // ASM optimizations).
7281ad8388SMartin Matuska //
7381ad8388SMartin Matuska // I'm making the table version the default, because that has good speed
7481ad8388SMartin Matuska // on all systems I have tried. The size optimized version is sometimes
7581ad8388SMartin Matuska // slightly faster, but sometimes it is a lot slower.
7681ad8388SMartin Matuska 
7781ad8388SMartin Matuska #ifdef HAVE_SMALL
7853200025SRui Paulo #	define get_dist_slot(dist) \
7953200025SRui Paulo 		((dist) <= 4 ? (dist) : get_dist_slot_2(dist))
8081ad8388SMartin Matuska 
8181ad8388SMartin Matuska static inline uint32_t
8253200025SRui Paulo get_dist_slot_2(uint32_t dist)
8381ad8388SMartin Matuska {
8453200025SRui Paulo 	const uint32_t i = bsr32(dist);
8553200025SRui Paulo 	return (i + i) + ((dist >> (i - 1)) & 1);
8681ad8388SMartin Matuska }
8781ad8388SMartin Matuska 
8881ad8388SMartin Matuska 
8981ad8388SMartin Matuska #else
9081ad8388SMartin Matuska 
9181ad8388SMartin Matuska #define FASTPOS_BITS 13
9281ad8388SMartin Matuska 
93ca6a6373SXin LI lzma_attr_visibility_hidden
9481ad8388SMartin Matuska extern const uint8_t lzma_fastpos[1 << FASTPOS_BITS];
9581ad8388SMartin Matuska 
9681ad8388SMartin Matuska 
9781ad8388SMartin Matuska #define fastpos_shift(extra, n) \
9881ad8388SMartin Matuska 	((extra) + (n) * (FASTPOS_BITS - 1))
9981ad8388SMartin Matuska 
10081ad8388SMartin Matuska #define fastpos_limit(extra, n) \
10181ad8388SMartin Matuska 	(UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n)))
10281ad8388SMartin Matuska 
10353200025SRui Paulo #define fastpos_result(dist, extra, n) \
104a8675d92SXin LI 	(uint32_t)(lzma_fastpos[(dist) >> fastpos_shift(extra, n)]) \
10581ad8388SMartin Matuska 			+ 2 * fastpos_shift(extra, n)
10681ad8388SMartin Matuska 
10781ad8388SMartin Matuska 
10881ad8388SMartin Matuska static inline uint32_t
10953200025SRui Paulo get_dist_slot(uint32_t dist)
11081ad8388SMartin Matuska {
11181ad8388SMartin Matuska 	// If it is small enough, we can pick the result directly from
11281ad8388SMartin Matuska 	// the precalculated table.
11353200025SRui Paulo 	if (dist < fastpos_limit(0, 0))
11453200025SRui Paulo 		return lzma_fastpos[dist];
11581ad8388SMartin Matuska 
11653200025SRui Paulo 	if (dist < fastpos_limit(0, 1))
11753200025SRui Paulo 		return fastpos_result(dist, 0, 1);
11881ad8388SMartin Matuska 
11953200025SRui Paulo 	return fastpos_result(dist, 0, 2);
12081ad8388SMartin Matuska }
12181ad8388SMartin Matuska 
12281ad8388SMartin Matuska 
12381ad8388SMartin Matuska #ifdef FULL_DISTANCES_BITS
12481ad8388SMartin Matuska static inline uint32_t
12553200025SRui Paulo get_dist_slot_2(uint32_t dist)
12681ad8388SMartin Matuska {
12753200025SRui Paulo 	assert(dist >= FULL_DISTANCES);
12881ad8388SMartin Matuska 
12953200025SRui Paulo 	if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 0))
13053200025SRui Paulo 		return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 0);
13181ad8388SMartin Matuska 
13253200025SRui Paulo 	if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 1))
13353200025SRui Paulo 		return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 1);
13481ad8388SMartin Matuska 
13553200025SRui Paulo 	return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 2);
13681ad8388SMartin Matuska }
13781ad8388SMartin Matuska #endif
13881ad8388SMartin Matuska 
13981ad8388SMartin Matuska #endif
14081ad8388SMartin Matuska 
14181ad8388SMartin Matuska #endif
142