xref: /linux/fs/bcachefs/extents_format.h (revision 4a4b30ea80d8cb5e8c4c62bb86201f4ea0d9b030)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _BCACHEFS_EXTENTS_FORMAT_H
3 #define _BCACHEFS_EXTENTS_FORMAT_H
4 
5 /*
6  * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
7  * preceded by checksum/compression information (bch_extent_crc32 or
8  * bch_extent_crc64).
9  *
10  * One major determining factor in the format of extents is how we handle and
11  * represent extents that have been partially overwritten and thus trimmed:
12  *
13  * If an extent is not checksummed or compressed, when the extent is trimmed we
14  * don't have to remember the extent we originally allocated and wrote: we can
15  * merely adjust ptr->offset to point to the start of the data that is currently
16  * live. The size field in struct bkey records the current (live) size of the
17  * extent, and is also used to mean "size of region on disk that we point to" in
18  * this case.
19  *
20  * Thus an extent that is not checksummed or compressed will consist only of a
21  * list of bch_extent_ptrs, with none of the fields in
22  * bch_extent_crc32/bch_extent_crc64.
23  *
24  * When an extent is checksummed or compressed, it's not possible to read only
25  * the data that is currently live: we have to read the entire extent that was
26  * originally written, and then return only the part of the extent that is
27  * currently live.
28  *
29  * Thus, in addition to the current size of the extent in struct bkey, we need
30  * to store the size of the originally allocated space - this is the
31  * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
32  * when the extent is trimmed, instead of modifying the offset field of the
33  * pointer, we keep a second smaller offset field - "offset into the original
34  * extent of the currently live region".
35  *
36  * The other major determining factor is replication and data migration:
37  *
38  * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
39  * write, we will initially write all the replicas in the same format, with the
40  * same checksum type and compression format - however, when copygc runs later (or
41  * tiering/cache promotion, anything that moves data), it is not in general
42  * going to rewrite all the pointers at once - one of the replicas may be in a
43  * bucket on one device that has very little fragmentation while another lives
44  * in a bucket that has become heavily fragmented, and thus is being rewritten
45  * sooner than the rest.
46  *
47  * Thus it will only move a subset of the pointers (or in the case of
48  * tiering/cache promotion perhaps add a single pointer without dropping any
49  * current pointers), and if the extent has been partially overwritten it must
50  * write only the currently live portion (or copygc would not be able to reduce
51  * fragmentation!) - which necessitates a different bch_extent_crc format for
52  * the new pointer.
53  *
54  * But in the interests of space efficiency, we don't want to store one
55  * bch_extent_crc for each pointer if we don't have to.
56  *
57  * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
58  * bch_extent_ptrs appended arbitrarily one after the other. We determine the
59  * type of a given entry with a scheme similar to utf8 (except we're encoding a
60  * type, not a size), encoding the type in the position of the first set bit:
61  *
62  * bch_extent_ptr	- 0b1
63  * bch_extent_crc32	- 0b10
64  * bch_extent_crc64	- 0b100
65  *
66  * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
67  * bch_extent_crc64 is the least constrained).
68  *
69  * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
70  * until the next bch_extent_crc32/64.
71  *
72  * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
73  * is neither checksummed nor compressed.
74  */
75 
/*
 * x-macro list of extent entry types: x(name, type_nr).
 *
 * Per the comment at the top of this file, an entry's on-disk tag is encoded
 * in the position of the first set bit - i.e. as (1 << type_nr) - so these
 * numbers are part of the on-disk format and must never be renumbered.
 */
76 #define BCH_EXTENT_ENTRY_TYPES()		\
77 	x(ptr,			0)		\
78 	x(crc32,		1)		\
79 	x(crc64,		2)		\
80 	x(crc128,		3)		\
81 	x(stripe_ptr,		4)		\
82 	x(rebalance,		5)		\
83 	x(flags,		6)
/* One past the highest type_nr in the list above; keep in sync */
84 #define BCH_EXTENT_ENTRY_MAX	7
85 
/* Expands BCH_EXTENT_ENTRY_TYPES() into BCH_EXTENT_ENTRY_<name> = <type_nr> */
86 enum bch_extent_entry_type {
87 #define x(f, n) BCH_EXTENT_ENTRY_##f = n,
88 	BCH_EXTENT_ENTRY_TYPES()
89 #undef x
90 };
91 
92 /* Compressed/uncompressed size are stored biased by 1: */
/*
 * Smallest, most bit-constrained checksum entry: 7 bit size/offset fields,
 * a 32 bit checksum, and no nonce field (hence CRC32_NONCE_MAX == 0 below).
 *
 * @offset is the offset of the currently live region into the originally
 * written extent, per the comment at the top of this file.
 */
93 struct bch_extent_crc32 {
94 #if defined(__LITTLE_ENDIAN_BITFIELD)
95 	__u32			type:2,
96 				_compressed_size:7,
97 				_uncompressed_size:7,
98 				offset:7,
99 				_unused:1,
100 				csum_type:4,
101 				compression_type:4;
102 	__u32			csum;
103 #elif defined (__BIG_ENDIAN_BITFIELD)
104 	__u32			csum;
105 	__u32			compression_type:4,
106 				csum_type:4,
107 				_unused:1,
108 				offset:7,
109 				_uncompressed_size:7,
110 				_compressed_size:7,
111 				type:2;
112 #endif
113 } __packed __aligned(8);
114 
/* Max value representable by the 7 bit, biased-by-1 size fields */
115 #define CRC32_SIZE_MAX		(1U << 7)
/* crc32 entries carry no nonce */
116 #define CRC32_NONCE_MAX		0
117 
/*
 * Like bch_extent_crc32 but with 9 bit size/offset fields, a 10 bit nonce,
 * and an 80 bit checksum: the high 16 bits are stored in @csum_hi, the low
 * 64 bits in @csum_lo.
 */
118 struct bch_extent_crc64 {
119 #if defined(__LITTLE_ENDIAN_BITFIELD)
120 	__u64			type:3,
121 				_compressed_size:9,
122 				_uncompressed_size:9,
123 				offset:9,
124 				nonce:10,
125 				csum_type:4,
126 				compression_type:4,
127 				csum_hi:16;
128 #elif defined (__BIG_ENDIAN_BITFIELD)
129 	__u64			csum_hi:16,
130 				compression_type:4,
131 				csum_type:4,
132 				nonce:10,
133 				offset:9,
134 				_uncompressed_size:9,
135 				_compressed_size:9,
136 				type:3;
137 #endif
138 	__u64			csum_lo;
139 } __packed __aligned(8);
140 
/* Max value representable by the 9 bit, biased-by-1 size fields */
141 #define CRC64_SIZE_MAX		(1U << 9)
142 #define CRC64_NONCE_MAX		((1U << 10) - 1)
143 
/*
 * Largest, least bit-constrained checksum entry: 13 bit size/offset/nonce
 * fields and a full struct bch_csum (declared elsewhere), i.e. room for
 * 128 bit checksums.
 */
144 struct bch_extent_crc128 {
145 #if defined(__LITTLE_ENDIAN_BITFIELD)
146 	__u64			type:4,
147 				_compressed_size:13,
148 				_uncompressed_size:13,
149 				offset:13,
150 				nonce:13,
151 				csum_type:4,
152 				compression_type:4;
153 #elif defined (__BIG_ENDIAN_BITFIELD)
154 	__u64			compression_type:4,
155 				csum_type:4,
156 				nonce:13,
157 				offset:13,
158 				_uncompressed_size:13,
159 				_compressed_size:13,
160 				type:4;
161 #endif
162 	struct bch_csum		csum;
163 } __packed __aligned(8);
164 
/* Max value representable by the 13 bit, biased-by-1 size fields */
165 #define CRC128_SIZE_MAX		(1U << 13)
166 #define CRC128_NONCE_MAX	((1U << 13) - 1)
167 
168 /*
169  * A single data pointer within an extent.
170  *
 * @unwritten	- pointer hasn't been written to, just reserved (the stale
 *		  "@reservation" note that used to live here appears to
 *		  describe this field - NOTE(review): confirm)
 * @cached	- NOTE(review): presumably marks a droppable cached replica;
 *		  inferred from the name, confirm against extent code
 * @dev		- member device index
 * @gen		- NOTE(review): presumably a bucket generation number used to
 *		  detect stale pointers - confirm
 */
171 struct bch_extent_ptr {
172 #if defined(__LITTLE_ENDIAN_BITFIELD)
173 	__u64			type:1,
174 				cached:1,
175 				unused:1,
176 				unwritten:1,
177 				offset:44, /* 8 petabytes */
178 				dev:8,
179 				gen:8;
180 #elif defined (__BIG_ENDIAN_BITFIELD)
181 	__u64			gen:8,
182 				dev:8,
183 				offset:44,
184 				unwritten:1,
185 				unused:1,
186 				cached:1,
187 				type:1;
188 #endif
189 } __packed __aligned(8);
190 
/*
 * Pointer into an erasure coded stripe.
 *
 * NOTE(review): inferred from field names - @idx presumably indexes the
 * stripe key, @block the block within the stripe, @redundancy the number of
 * redundant blocks; confirm against the erasure coding code. Unlike the
 * other entry structs in this file, this one is not marked
 * __packed __aligned(8).
 */
191 struct bch_extent_stripe_ptr {
192 #if defined(__LITTLE_ENDIAN_BITFIELD)
193 	__u64			type:5,
194 				block:8,
195 				redundancy:4,
196 				idx:47;
197 #elif defined (__BIG_ENDIAN_BITFIELD)
198 	__u64			idx:47,
199 				redundancy:4,
200 				block:8,
201 				type:5;
202 #endif
203 };
204 
/* x-macro list of per-extent flag bits: x(name, bit_nr) */
205 #define BCH_EXTENT_FLAGS()		\
206 	x(poisoned,		0)
207 
208 enum bch_extent_flags_e {
209 #define x(n, v)	BCH_EXTENT_FLAG_##n = v,
210 	BCH_EXTENT_FLAGS()
211 #undef x
212 };
213 
/*
 * Extent entry holding flag bits from BCH_EXTENT_FLAGS(). The 7 bit type
 * field matches BCH_EXTENT_ENTRY_flags == 6 under the first-set-bit tag
 * encoding (tag value 1 << 6).
 */
214 struct bch_extent_flags {
215 #if defined(__LITTLE_ENDIAN_BITFIELD)
216 	__u64			type:7,
217 				flags:57;
218 #elif defined (__BIG_ENDIAN_BITFIELD)
219 	__u64			flags:57,
220 				type:7;
221 #endif
222 };
223 
224 /* bch_extent_rebalance: */
225 #include "rebalance_format.h"
226 
/*
 * A single entry in an extent value - one of the structs above, overlaid
 * with a word through which the type tag (low-order bits of the entry's
 * first u64) can be read without knowing the variant.
 *
 * On 32 bit big endian, the low-order bits of the 64 bit entry live in the
 * second long, hence the pad.
 */
227 union bch_extent_entry {
228 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
229 	unsigned long			type;
230 #elif __BITS_PER_LONG == 32
231 	struct {
232 		unsigned long		pad;
233 		unsigned long		type;
234 	};
235 #else
236 #error edit for your odd byteorder.
237 #endif
238 
239 #define x(f, n) struct bch_extent_##f	f;
240 	BCH_EXTENT_ENTRY_TYPES()
241 #undef x
242 };
243 
/*
 * Btree node pointer: a bare list of bch_extent_ptrs, no crc entries
 * (btree pointers don't carry checksums - see the comment above
 * BKEY_BTREE_PTR_VAL_U64s_MAX). The zero-length @_data member aliases
 * @start so the value can be addressed as an array of u64s.
 */
244 struct bch_btree_ptr {
245 	struct bch_val		v;
246 
247 	__u64			_data[0];
248 	struct bch_extent_ptr	start[];
249 } __packed __aligned(8);
250 
/*
 * v2 btree node pointer: adds @seq, @sectors_written, @flags (see the
 * LE16_BITMASK accessor below) and @min_key ahead of the pointer list.
 *
 * @mem_ptr - NOTE(review): presumably host-side scratch space, not
 * meaningful on disk - confirm against the btree code.
 */
251 struct bch_btree_ptr_v2 {
252 	struct bch_val		v;
253 
254 	__u64			mem_ptr;
255 	__le64			seq;
256 	__le16			sectors_written;
257 	__le16			flags;
258 	struct bpos		min_key;
259 	__u64			_data[0];
260 	struct bch_extent_ptr	start[];
261 } __packed __aligned(8);
262 
/* Accessor for bit 0 of bch_btree_ptr_v2.flags */
263 LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,	struct bch_btree_ptr_v2, flags, 0, 1);
264 
/*
 * An extent value: a sequence of bch_extent_crc*/bch_extent_ptr/... entries
 * appended one after the other, as described in the comment at the top of
 * this file. @_data[0] aliases @start for u64-granularity access.
 */
265 struct bch_extent {
266 	struct bch_val		v;
267 
268 	__u64			_data[0];
269 	union bch_extent_entry	start[];
270 } __packed __aligned(8);
271 
272 /* Maximum size (in u64s) a single pointer could be: */
273 #define BKEY_EXTENT_PTR_U64s_MAX\
274 	((sizeof(struct bch_extent_crc128) +			\
275 	  sizeof(struct bch_extent_ptr)) / sizeof(__u64))
276 
277 /* Maximum possible size of an entire extent value: */
/*
 * NOTE(review): the + 1 presumably leaves room for one extra (e.g. cached)
 * pointer beyond BCH_REPLICAS_MAX - confirm
 */
278 #define BKEY_EXTENT_VAL_U64s_MAX				\
279 	(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
280 
281 /* Maximum possible size of an entire extent, key + value: */
282 #define BKEY_EXTENT_U64s_MAX		(BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
283 
284 /* Btree pointers don't carry around checksums: */
285 #define BKEY_BTREE_PTR_VAL_U64s_MAX				\
286 	((sizeof(struct bch_btree_ptr_v2) +			\
287 	  sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
288 #define BKEY_BTREE_PTR_U64s_MAX					\
289 	(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
290 
/*
 * NOTE(review): presumably a space reservation - replicas reserved but not
 * yet written (cf. the "@reservation" note earlier in this file) - confirm.
 * @pad keeps the value 8-byte sized.
 */
291 struct bch_reservation {
292 	struct bch_val		v;
293 
294 	__le32			generation;
295 	__u8			nr_replicas;
296 	__u8			pad[3];
297 } __packed __aligned(8);
298 
/*
 * Extent value that holds the data itself inline rather than pointers to it.
 * Uses __u8 (rather than bare u8) to match the fixed-width __u8/__u32/__u64
 * types used by every other on-disk struct in this file.
 */
299 struct bch_inline_data {
300 	struct bch_val		v;
301 	__u8			data[];
302 };
303 
304 #endif /* _BCACHEFS_EXTENTS_FORMAT_H */
305