xref: /linux/fs/bcachefs/extents.c (revision 131c040bbb0f561ef68ad2ba6fcd28c97fa6d4cf)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4  *
5  * Code for managing the extent btree and dynamically updating the writeback
6  * dirty sector count.
7  */
8 
9 #include "bcachefs.h"
10 #include "bkey_methods.h"
11 #include "btree_cache.h"
12 #include "btree_gc.h"
13 #include "btree_io.h"
14 #include "btree_iter.h"
15 #include "buckets.h"
16 #include "checksum.h"
17 #include "compress.h"
18 #include "debug.h"
19 #include "disk_groups.h"
20 #include "error.h"
21 #include "extents.h"
22 #include "inode.h"
23 #include "journal.h"
24 #include "rebalance.h"
25 #include "replicas.h"
26 #include "super.h"
27 #include "super-io.h"
28 #include "trace.h"
29 #include "util.h"
30 
31 static unsigned bch2_crc_field_size_max[] = {
32 	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
33 	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
34 	[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
35 };
36 
37 static void bch2_extent_crc_pack(union bch_extent_crc *,
38 				 struct bch_extent_crc_unpacked,
39 				 enum bch_extent_entry_type);
40 
41 struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
42 						 unsigned dev)
43 {
44 	struct bch_dev_io_failures *i;
45 
46 	for (i = f->devs; i < f->devs + f->nr; i++)
47 		if (i->dev == dev)
48 			return i;
49 
50 	return NULL;
51 }
52 
53 void bch2_mark_io_failure(struct bch_io_failures *failed,
54 			  struct extent_ptr_decoded *p)
55 {
56 	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
57 
58 	if (!f) {
59 		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
60 
61 		f = &failed->devs[failed->nr++];
62 		f->dev		= p->ptr.dev;
63 		f->idx		= p->idx;
64 		f->nr_failed	= 1;
65 		f->nr_retries	= 0;
66 	} else if (p->idx != f->idx) {
67 		f->idx		= p->idx;
68 		f->nr_failed	= 1;
69 		f->nr_retries	= 0;
70 	} else {
71 		f->nr_failed++;
72 	}
73 }
74 
75 static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
76 {
77 	struct bch_dev *ca = bch2_dev_rcu(c, dev);
78 	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
79 }
80 
81 /*
82  * returns true if p1 is better than p2:
83  */
84 static inline bool ptr_better(struct bch_fs *c,
85 			      const struct extent_ptr_decoded p1,
86 			      const struct extent_ptr_decoded p2)
87 {
88 	if (likely(!p1.idx && !p2.idx)) {
89 		u64 l1 = dev_latency(c, p1.ptr.dev);
90 		u64 l2 = dev_latency(c, p2.ptr.dev);
91 
92 		/*
93 		 * Square the latencies, to bias more in favor of the faster
94 		 * device - we never want to stop issuing reads to the slower
95 		 * device altogether, so that we can update our latency numbers:
96 		 */
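		/*
		 * Worked example (illustrative numbers): with raw latencies 2
		 * and 4 the squared weights are 4 and 16, so the faster device
		 * is preferred with probability roughly 16/20 = 80%, versus
		 * roughly 4/6 = 67% without squaring.
		 */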
97 		l1 *= l1;
98 		l2 *= l2;
99 
100 		/* Pick at random, biased in favor of the faster device: */
101 
102 		return bch2_get_random_u64_below(l1 + l2) > l1;
103 	}
104 
105 	if (bch2_force_reconstruct_read)
106 		return p1.idx > p2.idx;
107 
108 	return p1.idx < p2.idx;
109 }
110 
111 /*
112  * Picks the best live pointer to read from, preferring devices that haven't yet
113  * failed for this read (per @failed, which may be NULL). Returns 1 if a pointer
114  * was picked, 0 for an unwritten extent (a hole), or a negative error otherwise.
115  */
116 int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
117 			       struct bch_io_failures *failed,
118 			       struct extent_ptr_decoded *pick)
119 {
120 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
121 	const union bch_extent_entry *entry;
122 	struct extent_ptr_decoded p;
123 	struct bch_dev_io_failures *f;
124 	int ret = 0;
125 
126 	if (k.k->type == KEY_TYPE_error)
127 		return -BCH_ERR_key_type_error;
128 
129 	rcu_read_lock();
130 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
131 		/*
132 		 * Unwritten extent: no need to actually read, treat it as a
133 		 * hole and return 0s:
134 		 */
135 		if (p.ptr.unwritten) {
136 			ret = 0;
137 			break;
138 		}
139 
140 		/*
141 		 * If there are any dirty pointers it's an error if we can't
142 		 * read:
143 		 */
144 		if (!ret && !p.ptr.cached)
145 			ret = -BCH_ERR_no_device_to_read_from;
146 
147 		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
148 
149 		if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
150 			continue;
151 
152 		f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
153 		if (f)
154 			p.idx = f->nr_failed < f->nr_retries
155 				? f->idx
156 				: f->idx + 1;
157 
158 		if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
159 			p.idx++;
160 
161 		if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
162 			p.idx++;
163 
164 		if (p.idx > (unsigned) p.has_ec)
165 			continue;
166 
167 		if (ret > 0 && !ptr_better(c, p, *pick))
168 			continue;
169 
170 		*pick = p;
171 		ret = 1;
172 	}
173 	rcu_read_unlock();
174 
175 	return ret;
176 }
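
/*
 * A minimal caller sketch (hypothetical helpers, heavily simplified - the
 * real read path does considerably more):
 *
 *	struct extent_ptr_decoded pick;
 *	int ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
 *
 *	if (ret < 0)
 *		return ret;		// nothing readable
 *	else if (!ret)
 *		zero_fill(bio);		// unwritten extent: it's a hole
 *	else
 *		submit_read(bio, &pick);// on error, bch2_mark_io_failure()
 *					// and retry the pick
 */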
177 
178 /* KEY_TYPE_btree_ptr: */
179 
180 int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
181 			    struct bkey_validate_context from)
182 {
183 	int ret = 0;
184 
185 	bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
186 			 c, btree_ptr_val_too_big,
187 			 "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
188 
189 	ret = bch2_bkey_ptrs_validate(c, k, from);
190 fsck_err:
191 	return ret;
192 }
193 
194 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
195 			    struct bkey_s_c k)
196 {
197 	bch2_bkey_ptrs_to_text(out, c, k);
198 }
199 
200 int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
201 			       struct bkey_validate_context from)
202 {
203 	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
204 	int ret = 0;
205 
206 	bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
207 			 c, btree_ptr_v2_val_too_big,
208 			 "value too big (%zu > %zu)",
209 			 bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
210 
211 	bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
212 			 c, btree_ptr_v2_min_key_bad,
213 			 "min_key > key");
214 
215 	if ((from.flags & BCH_VALIDATE_write) &&
216 	    c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
217 		bkey_fsck_err_on(!bp.v->sectors_written,
218 				 c, btree_ptr_v2_written_0,
219 				 "sectors_written == 0");
220 
221 	ret = bch2_bkey_ptrs_validate(c, k, from);
222 fsck_err:
223 	return ret;
224 }
225 
226 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
227 			       struct bkey_s_c k)
228 {
229 	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
230 
231 	prt_printf(out, "seq %llx written %u min_key %s",
232 	       le64_to_cpu(bp.v->seq),
233 	       le16_to_cpu(bp.v->sectors_written),
234 	       BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
235 
236 	bch2_bpos_to_text(out, bp.v->min_key);
237 	prt_printf(out, " ");
238 	bch2_bkey_ptrs_to_text(out, c, k);
239 }
240 
241 void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
242 			      unsigned big_endian, int write,
243 			      struct bkey_s k)
244 {
245 	struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
246 
247 	compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
248 
249 	if (version < bcachefs_metadata_version_inode_btree_change &&
250 	    btree_id_is_extents(btree_id) &&
251 	    !bkey_eq(bp.v->min_key, POS_MIN))
252 		bp.v->min_key = write
253 			? bpos_nosnap_predecessor(bp.v->min_key)
254 			: bpos_nosnap_successor(bp.v->min_key);
255 }
256 
257 /* KEY_TYPE_extent: */
258 
259 bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
260 {
261 	struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
262 	struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
263 	union bch_extent_entry *en_l;
264 	const union bch_extent_entry *en_r;
265 	struct extent_ptr_decoded lp, rp;
266 	bool use_right_ptr;
267 
268 	en_l = l_ptrs.start;
269 	en_r = r_ptrs.start;
270 	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
271 		if (extent_entry_type(en_l) != extent_entry_type(en_r))
272 			return false;
273 
274 		en_l = extent_entry_next(en_l);
275 		en_r = extent_entry_next(en_r);
276 	}
277 
278 	if (en_l < l_ptrs.end || en_r < r_ptrs.end)
279 		return false;
280 
281 	en_l = l_ptrs.start;
282 	en_r = r_ptrs.start;
283 	lp.crc = bch2_extent_crc_unpack(l.k, NULL);
284 	rp.crc = bch2_extent_crc_unpack(r.k, NULL);
285 
286 	while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
287 	       __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
288 		if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
289 		    rp.ptr.offset + rp.crc.offset ||
290 		    lp.ptr.dev			!= rp.ptr.dev ||
291 		    lp.ptr.gen			!= rp.ptr.gen ||
292 		    lp.ptr.unwritten		!= rp.ptr.unwritten ||
293 		    lp.has_ec			!= rp.has_ec)
294 			return false;
295 
296 		/* Extents may not straddle buckets: */
297 		rcu_read_lock();
298 		struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
299 		bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
300 		rcu_read_unlock();
301 
302 		if (!same_bucket)
303 			return false;
304 
305 		if (lp.has_ec			!= rp.has_ec ||
306 		    (lp.has_ec &&
307 		     (lp.ec.block		!= rp.ec.block ||
308 		      lp.ec.redundancy		!= rp.ec.redundancy ||
309 		      lp.ec.idx			!= rp.ec.idx)))
310 			return false;
311 
312 		if (lp.crc.compression_type	!= rp.crc.compression_type ||
313 		    lp.crc.nonce		!= rp.crc.nonce)
314 			return false;
315 
316 		if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
317 		    lp.crc.uncompressed_size) {
318 			/* can use left extent's crc entry */
319 		} else if (lp.crc.live_size <= rp.crc.offset) {
320 			/* can use right extent's crc entry */
321 		} else {
322 			/* check if checksums can be merged: */
323 			if (lp.crc.csum_type		!= rp.crc.csum_type ||
324 			    lp.crc.nonce		!= rp.crc.nonce ||
325 			    crc_is_compressed(lp.crc) ||
326 			    !bch2_checksum_mergeable(lp.crc.csum_type))
327 				return false;
328 
329 			if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
330 			    rp.crc.offset)
331 				return false;
332 
333 			if (lp.crc.csum_type &&
334 			    lp.crc.uncompressed_size +
335 			    rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
336 				return false;
337 		}
338 
339 		en_l = extent_entry_next(en_l);
340 		en_r = extent_entry_next(en_r);
341 	}
342 
343 	en_l = l_ptrs.start;
344 	en_r = r_ptrs.start;
345 	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
346 		if (extent_entry_is_crc(en_l)) {
347 			struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
348 			struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
349 
350 			if (crc_l.uncompressed_size + crc_r.uncompressed_size >
351 			    bch2_crc_field_size_max[extent_entry_type(en_l)])
352 				return false;
353 		}
354 
355 		en_l = extent_entry_next(en_l);
356 		en_r = extent_entry_next(en_r);
357 	}
358 
359 	use_right_ptr = false;
360 	en_l = l_ptrs.start;
361 	en_r = r_ptrs.start;
362 	while (en_l < l_ptrs.end) {
363 		if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
364 		    use_right_ptr)
365 			en_l->ptr = en_r->ptr;
366 
367 		if (extent_entry_is_crc(en_l)) {
368 			struct bch_extent_crc_unpacked crc_l =
369 				bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
370 			struct bch_extent_crc_unpacked crc_r =
371 				bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
372 
373 			use_right_ptr = false;
374 
375 			if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
376 			    crc_l.uncompressed_size) {
377 				/* can use left extent's crc entry */
378 			} else if (crc_l.live_size <= crc_r.offset) {
379 				/* can use right extent's crc entry */
380 				crc_r.offset -= crc_l.live_size;
381 				bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
382 						     extent_entry_type(en_l));
383 				use_right_ptr = true;
384 			} else {
385 				crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
386 								 crc_l.csum,
387 								 crc_r.csum,
388 								 crc_r.uncompressed_size << 9);
389 
390 				crc_l.uncompressed_size	+= crc_r.uncompressed_size;
391 				crc_l.compressed_size	+= crc_r.compressed_size;
392 				bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
393 						     extent_entry_type(en_l));
394 			}
395 		}
396 
397 		en_l = extent_entry_next(en_l);
398 		en_r = extent_entry_next(en_r);
399 	}
400 
401 	bch2_key_resize(l.k, l.k->size + r.k->size);
402 	return true;
403 }
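
/*
 * When merging succeeds, two keys describing adjacent, physically contiguous
 * data collapse into one: e.g. (illustrative sizes) an 8-sector extent and the
 * 8-sector extent immediately following it, pointing at consecutive sectors of
 * the same bucket on the same device, become a single 16-sector extent, with
 * their checksum entries reused or merged as above.
 */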
404 
405 /* KEY_TYPE_reservation: */
406 
407 int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
408 			      struct bkey_validate_context from)
409 {
410 	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
411 	int ret = 0;
412 
413 	bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
414 			 c, reservation_key_nr_replicas_invalid,
415 			 "invalid nr_replicas (%u)", r.v->nr_replicas);
416 fsck_err:
417 	return ret;
418 }
419 
420 void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
421 			      struct bkey_s_c k)
422 {
423 	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
424 
425 	prt_printf(out, "generation %u replicas %u",
426 	       le32_to_cpu(r.v->generation),
427 	       r.v->nr_replicas);
428 }
429 
430 bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
431 {
432 	struct bkey_s_reservation l = bkey_s_to_reservation(_l);
433 	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
434 
435 	if (l.v->generation != r.v->generation ||
436 	    l.v->nr_replicas != r.v->nr_replicas)
437 		return false;
438 
439 	bch2_key_resize(l.k, l.k->size + r.k->size);
440 	return true;
441 }
442 
443 /* Extent checksum entries: */
444 
445 /* returns true if not equal */
446 static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
447 					 struct bch_extent_crc_unpacked r)
448 {
449 	return (l.csum_type		!= r.csum_type ||
450 		l.compression_type	!= r.compression_type ||
451 		l.compressed_size	!= r.compressed_size ||
452 		l.uncompressed_size	!= r.uncompressed_size ||
453 		l.offset		!= r.offset ||
454 		l.live_size		!= r.live_size ||
455 		l.nonce			!= r.nonce ||
456 		bch2_crc_cmp(l.csum, r.csum));
457 }
458 
459 static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
460 				  struct bch_extent_crc_unpacked n)
461 {
462 	return !crc_is_compressed(u) &&
463 		u.csum_type &&
464 		u.uncompressed_size > u.live_size &&
465 		bch2_csum_type_is_encryption(u.csum_type) ==
466 		bch2_csum_type_is_encryption(n.csum_type);
467 }
468 
469 bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
470 				 struct bch_extent_crc_unpacked n)
471 {
472 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
473 	struct bch_extent_crc_unpacked crc;
474 	const union bch_extent_entry *i;
475 
476 	if (!n.csum_type)
477 		return false;
478 
479 	bkey_for_each_crc(k.k, ptrs, crc, i)
480 		if (can_narrow_crc(crc, n))
481 			return true;
482 
483 	return false;
484 }
485 
486 /*
487  * We're writing another replica for this extent, so while we've got the data in
488  * memory we'll be computing a new checksum for the currently live data.
489  *
490  * If there are other replicas we aren't moving, and they are checksummed but
491  * not compressed, we can modify them to point to only the data that is
492  * currently live (so that readers won't have to bounce) while we've got the
493  * checksum we need:
494  */
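/*
 * For example (illustrative sizes): if a checksum entry still covers 128
 * uncompressed sectors but the key now references only 64 live sectors, and we
 * just computed a checksum of those 64 sectors while writing another replica,
 * the other replicas' pointers can be narrowed to cover exactly the live 64 -
 * so future reads no longer have to read and bounce the full 128 sectors just
 * to verify the checksum.
 */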
495 bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
496 {
497 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
498 	struct bch_extent_crc_unpacked u;
499 	struct extent_ptr_decoded p;
500 	union bch_extent_entry *i;
501 	bool ret = false;
502 
503 	/* Find a checksum entry that covers only live data: */
504 	if (!n.csum_type) {
505 		bkey_for_each_crc(&k->k, ptrs, u, i)
506 			if (!crc_is_compressed(u) &&
507 			    u.csum_type &&
508 			    u.live_size == u.uncompressed_size) {
509 				n = u;
510 				goto found;
511 			}
512 		return false;
513 	}
514 found:
515 	BUG_ON(crc_is_compressed(n));
516 	BUG_ON(n.offset);
517 	BUG_ON(n.live_size != k->k.size);
518 
519 restart_narrow_pointers:
520 	ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
521 
522 	bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
523 		if (can_narrow_crc(p.crc, n)) {
524 			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
525 			p.ptr.offset += p.crc.offset;
526 			p.crc = n;
527 			bch2_extent_ptr_decoded_append(k, &p);
528 			ret = true;
529 			goto restart_narrow_pointers;
530 		}
531 
532 	return ret;
533 }
534 
535 static void bch2_extent_crc_pack(union bch_extent_crc *dst,
536 				 struct bch_extent_crc_unpacked src,
537 				 enum bch_extent_entry_type type)
538 {
539 #define set_common_fields(_dst, _src)					\
540 		_dst.type		= 1 << type;			\
541 		_dst.csum_type		= _src.csum_type,		\
542 		_dst.compression_type	= _src.compression_type,	\
543 		_dst._compressed_size	= _src.compressed_size - 1,	\
544 		_dst._uncompressed_size	= _src.uncompressed_size - 1,	\
545 		_dst.offset		= _src.offset
546 
547 	switch (type) {
548 	case BCH_EXTENT_ENTRY_crc32:
549 		set_common_fields(dst->crc32, src);
550 		dst->crc32.csum		= (u32 __force) *((__le32 *) &src.csum.lo);
551 		break;
552 	case BCH_EXTENT_ENTRY_crc64:
553 		set_common_fields(dst->crc64, src);
554 		dst->crc64.nonce	= src.nonce;
555 		dst->crc64.csum_lo	= (u64 __force) src.csum.lo;
556 		dst->crc64.csum_hi	= (u64 __force) *((__le16 *) &src.csum.hi);
557 		break;
558 	case BCH_EXTENT_ENTRY_crc128:
559 		set_common_fields(dst->crc128, src);
560 		dst->crc128.nonce	= src.nonce;
561 		dst->crc128.csum	= src.csum;
562 		break;
563 	default:
564 		BUG();
565 	}
566 #undef set_common_fields
567 }
568 
569 void bch2_extent_crc_append(struct bkey_i *k,
570 			    struct bch_extent_crc_unpacked new)
571 {
572 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
573 	union bch_extent_crc *crc = (void *) ptrs.end;
574 	enum bch_extent_entry_type type;
575 
576 	if (bch_crc_bytes[new.csum_type]	<= 4 &&
577 	    new.uncompressed_size		<= CRC32_SIZE_MAX &&
578 	    new.nonce				<= CRC32_NONCE_MAX)
579 		type = BCH_EXTENT_ENTRY_crc32;
580 	else if (bch_crc_bytes[new.csum_type]	<= 10 &&
581 		   new.uncompressed_size	<= CRC64_SIZE_MAX &&
582 		   new.nonce			<= CRC64_NONCE_MAX)
583 		type = BCH_EXTENT_ENTRY_crc64;
584 	else if (bch_crc_bytes[new.csum_type]	<= 16 &&
585 		   new.uncompressed_size	<= CRC128_SIZE_MAX &&
586 		   new.nonce			<= CRC128_NONCE_MAX)
587 		type = BCH_EXTENT_ENTRY_crc128;
588 	else
589 		BUG();
590 
591 	bch2_extent_crc_pack(crc, new, type);
592 
593 	k->k.u64s += extent_entry_u64s(ptrs.end);
594 
595 	EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
596 }
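
/*
 * The entry format chosen above is simply the smallest variant that can hold
 * the checksum bytes, the extent size and the nonce: e.g. a 4-byte checksum on
 * a small extent packs into a crc32 entry, while a 16-byte checksum (or a
 * size/nonce exceeding the CRC64_* limits) requires a crc128 entry. The exact
 * cutoffs are the CRC*_SIZE_MAX and CRC*_NONCE_MAX constants.
 */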
597 
598 /* Generic code for keys with pointers: */
599 
600 unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
601 {
602 	return bch2_bkey_devs(k).nr;
603 }
604 
605 unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
606 {
607 	return k.k->type == KEY_TYPE_reservation
608 		? bkey_s_c_to_reservation(k).v->nr_replicas
609 		: bch2_bkey_dirty_devs(k).nr;
610 }
611 
612 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
613 {
614 	unsigned ret = 0;
615 
616 	if (k.k->type == KEY_TYPE_reservation) {
617 		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
618 	} else {
619 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
620 		const union bch_extent_entry *entry;
621 		struct extent_ptr_decoded p;
622 
623 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
624 			ret += !p.ptr.cached && !crc_is_compressed(p.crc);
625 	}
626 
627 	return ret;
628 }
629 
630 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
631 {
632 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
633 	const union bch_extent_entry *entry;
634 	struct extent_ptr_decoded p;
635 	unsigned ret = 0;
636 
637 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
638 		if (!p.ptr.cached && crc_is_compressed(p.crc))
639 			ret += p.crc.compressed_size;
640 
641 	return ret;
642 }
643 
644 bool bch2_bkey_is_incompressible(struct bkey_s_c k)
645 {
646 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
647 	const union bch_extent_entry *entry;
648 	struct bch_extent_crc_unpacked crc;
649 
650 	bkey_for_each_crc(k.k, ptrs, crc, entry)
651 		if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
652 			return true;
653 	return false;
654 }
655 
656 unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
657 {
658 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
659 	const union bch_extent_entry *entry;
660 	struct extent_ptr_decoded p = { 0 };
661 	unsigned replicas = 0;
662 
663 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
664 		if (p.ptr.cached)
665 			continue;
666 
667 		if (p.has_ec)
668 			replicas += p.ec.redundancy;
669 
670 		replicas++;
671 
672 	}
673 
674 	return replicas;
675 }
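
/*
 * Example (illustrative): a key with two non-cached pointers plus one erasure
 * coded pointer with redundancy 1 reports 1 + 1 + (1 + 1) = 4 replicas;
 * cached pointers are never counted.
 */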
676 
677 static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
678 {
679 	if (p->ptr.cached)
680 		return 0;
681 
682 	return p->has_ec
683 		? p->ec.redundancy + 1
684 		: ca->mi.durability;
685 }
686 
687 unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
688 {
689 	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
690 
691 	return ca ? __extent_ptr_durability(ca, p) : 0;
692 }
693 
694 unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
695 {
696 	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
697 
698 	if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
699 		return 0;
700 
701 	return __extent_ptr_durability(ca, p);
702 }
703 
704 unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
705 {
706 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
707 	const union bch_extent_entry *entry;
708 	struct extent_ptr_decoded p;
709 	unsigned durability = 0;
710 
711 	rcu_read_lock();
712 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
713 		durability += bch2_extent_ptr_durability(c, &p);
714 	rcu_read_unlock();
715 
716 	return durability;
717 }
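
/*
 * Example (illustrative): two pointers on durability-1 devices plus an erasure
 * coded pointer with redundancy 2 gives a durability of 1 + 1 + (2 + 1) = 5;
 * a pointer to a missing or failed device contributes 0.
 */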
718 
719 static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
720 {
721 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
722 	const union bch_extent_entry *entry;
723 	struct extent_ptr_decoded p;
724 	unsigned durability = 0;
725 
726 	rcu_read_lock();
727 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
728 		if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
729 			durability += bch2_extent_ptr_durability(c, &p);
730 	rcu_read_unlock();
731 
732 	return durability;
733 }
734 
735 void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
736 {
737 	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
738 	union bch_extent_entry *next = extent_entry_next(entry);
739 
740 	memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
741 	k->k.u64s -= extent_entry_u64s(entry);
742 }
743 
744 void bch2_extent_ptr_decoded_append(struct bkey_i *k,
745 				    struct extent_ptr_decoded *p)
746 {
747 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
748 	struct bch_extent_crc_unpacked crc =
749 		bch2_extent_crc_unpack(&k->k, NULL);
750 	union bch_extent_entry *pos;
751 
752 	if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
753 		pos = ptrs.start;
754 		goto found;
755 	}
756 
757 	bkey_for_each_crc(&k->k, ptrs, crc, pos)
758 		if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
759 			pos = extent_entry_next(pos);
760 			goto found;
761 		}
762 
763 	bch2_extent_crc_append(k, p->crc);
764 	pos = bkey_val_end(bkey_i_to_s(k));
765 found:
766 	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
767 	__extent_entry_insert(k, pos, to_entry(&p->ptr));
768 
769 	if (p->has_ec) {
770 		p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
771 		__extent_entry_insert(k, pos, to_entry(&p->ec));
772 	}
773 }
774 
775 static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
776 					  union bch_extent_entry *entry)
777 {
778 	union bch_extent_entry *i = ptrs.start;
779 
780 	if (i == entry)
781 		return NULL;
782 
783 	while (extent_entry_next(i) != entry)
784 		i = extent_entry_next(i);
785 	return i;
786 }
787 
788 /*
789  * Drops the given pointer, along with any crc/stripe entries that applied only to it:
790  */
791 void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
792 {
793 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
794 	union bch_extent_entry *entry = to_entry(ptr), *next;
795 	bool drop_crc = true;
796 
797 	if (k.k->type == KEY_TYPE_stripe) {
798 		ptr->dev = BCH_SB_MEMBER_INVALID;
799 		return;
800 	}
801 
802 	EBUG_ON(ptr < &ptrs.start->ptr ||
803 		ptr >= &ptrs.end->ptr);
804 	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
805 
806 	for (next = extent_entry_next(entry);
807 	     next != ptrs.end;
808 	     next = extent_entry_next(next)) {
809 		if (extent_entry_is_crc(next)) {
810 			break;
811 		} else if (extent_entry_is_ptr(next)) {
812 			drop_crc = false;
813 			break;
814 		}
815 	}
816 
817 	extent_entry_drop(k, entry);
818 
819 	while ((entry = extent_entry_prev(ptrs, entry))) {
820 		if (extent_entry_is_ptr(entry))
821 			break;
822 
823 		if ((extent_entry_is_crc(entry) && drop_crc) ||
824 		    extent_entry_is_stripe_ptr(entry))
825 			extent_entry_drop(k, entry);
826 	}
827 }
828 
829 void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
830 {
831 	if (k.k->type != KEY_TYPE_stripe) {
832 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
833 		const union bch_extent_entry *entry;
834 		struct extent_ptr_decoded p;
835 
836 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
837 			if (p.ptr.dev == ptr->dev && p.has_ec) {
838 				ptr->dev = BCH_SB_MEMBER_INVALID;
839 				return;
840 			}
841 	}
842 
843 	bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
844 
845 	bch2_bkey_drop_ptr_noerror(k, ptr);
846 
847 	/*
848 	 * If we deleted all the dirty pointers and there's still cached
849 	 * pointers, we could set the cached pointers to dirty if they're not
850 	 * stale - but to do that correctly we'd need to grab an open_bucket
851 	 * reference so that we don't race with bucket reuse:
852 	 */
853 	if (have_dirty &&
854 	    !bch2_bkey_dirty_devs(k.s_c).nr) {
855 		k.k->type = KEY_TYPE_error;
856 		set_bkey_val_u64s(k.k, 0);
857 	} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
858 		k.k->type = KEY_TYPE_deleted;
859 		set_bkey_val_u64s(k.k, 0);
860 	}
861 }
862 
863 void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
864 {
865 	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
866 }
867 
868 void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
869 {
870 	bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
871 }
872 
873 const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
874 {
875 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
876 
877 	bkey_for_each_ptr(ptrs, ptr)
878 		if (ptr->dev == dev)
879 			return ptr;
880 
881 	return NULL;
882 }
883 
884 bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
885 {
886 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
887 	struct bch_dev *ca;
888 	bool ret = false;
889 
890 	rcu_read_lock();
891 	bkey_for_each_ptr(ptrs, ptr)
892 		if (bch2_dev_in_target(c, ptr->dev, target) &&
893 		    (ca = bch2_dev_rcu(c, ptr->dev)) &&
894 		    (!ptr->cached ||
895 		     !dev_ptr_stale_rcu(ca, ptr))) {
896 			ret = true;
897 			break;
898 		}
899 	rcu_read_unlock();
900 
901 	return ret;
902 }
903 
904 bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
905 			   struct bch_extent_ptr m, u64 offset)
906 {
907 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
908 	const union bch_extent_entry *entry;
909 	struct extent_ptr_decoded p;
910 
911 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
912 		if (p.ptr.dev	== m.dev &&
913 		    p.ptr.gen	== m.gen &&
914 		    (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
915 		    (s64) m.offset  - offset)
916 			return true;
917 
918 	return false;
919 }
920 
921 /*
922  * Returns true if two extents refer to the same data:
923  */
924 bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
925 {
926 	if (k1.k->type != k2.k->type)
927 		return false;
928 
929 	if (bkey_extent_is_direct_data(k1.k)) {
930 		struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
931 		struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
932 		const union bch_extent_entry *entry1, *entry2;
933 		struct extent_ptr_decoded p1, p2;
934 
935 		if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
936 			return false;
937 
938 		bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
939 			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
940 				if (p1.ptr.dev		== p2.ptr.dev &&
941 				    p1.ptr.gen		== p2.ptr.gen &&
942 
943 				    /*
944 				     * This checks that the two pointers point
945 				     * to the same region on disk - adjusting
946 				     * for the difference in where the extents
947 				     * start, since one may have been trimmed:
948 				     */
949 				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
950 				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
951 
952 				    /*
953 				     * This additionally checks that the
954 				     * extents overlap on disk, since the
955 				     * previous check may trigger spuriously
956 				     * when one extent is immediately partially
957 				     * overwritten with another extent (so that
958 				     * on disk they are adjacent) and
959 				     * compression is in use:
960 				     */
961 				    ((p1.ptr.offset >= p2.ptr.offset &&
962 				      p1.ptr.offset  < p2.ptr.offset + p2.crc.compressed_size) ||
963 				     (p2.ptr.offset >= p1.ptr.offset &&
964 				      p2.ptr.offset  < p1.ptr.offset + p1.crc.compressed_size)))
965 					return true;
966 
967 		return false;
968 	} else {
969 		/* KEY_TYPE_deleted, etc. */
970 		return true;
971 	}
972 }
973 
974 struct bch_extent_ptr *
975 bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
976 {
977 	struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
978 	union bch_extent_entry *entry2;
979 	struct extent_ptr_decoded p2;
980 
981 	bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
982 		if (p1.ptr.dev		== p2.ptr.dev &&
983 		    p1.ptr.gen		== p2.ptr.gen &&
984 		    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
985 		    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
986 			return &entry2->ptr;
987 
988 	return NULL;
989 }
990 
991 static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
992 			    struct bch_extent_ptr *ptr)
993 {
994 	if (!opts->promote_target ||
995 	    !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
996 		return false;
997 
998 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
999 
1000 	return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
1001 }
1002 
1003 void bch2_extent_ptr_set_cached(struct bch_fs *c,
1004 				struct bch_io_opts *opts,
1005 				struct bkey_s k,
1006 				struct bch_extent_ptr *ptr)
1007 {
1008 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1009 	union bch_extent_entry *entry;
1010 	struct extent_ptr_decoded p;
1011 
1012 	rcu_read_lock();
1013 	if (!want_cached_ptr(c, opts, ptr)) {
1014 		bch2_bkey_drop_ptr_noerror(k, ptr);
1015 		goto out;
1016 	}
1017 
1018 	/*
1019 	 * Stripes can't contain cached data, for - reasons.
1020 	 *
1021 	 * Possibly something we can fix in the future?
1022 	 */
1023 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
1024 		if (&entry->ptr == ptr) {
1025 			if (p.has_ec)
1026 				bch2_bkey_drop_ptr_noerror(k, ptr);
1027 			else
1028 				ptr->cached = true;
1029 			goto out;
1030 		}
1031 
1032 	BUG();
1033 out:
1034 	rcu_read_unlock();
1035 }
1036 
1037 /*
1038  * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
1039  *
1040  * Returns true if @k should be dropped entirely
1041  *
1042  * For existing keys, only called when btree nodes are being rewritten, not when
1043  * they're merely being compacted/resorted in memory.
1044  */
1045 bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
1046 {
1047 	struct bch_dev *ca;
1048 
1049 	rcu_read_lock();
1050 	bch2_bkey_drop_ptrs(k, ptr,
1051 		ptr->cached &&
1052 		(!(ca = bch2_dev_rcu(c, ptr->dev)) ||
1053 		 dev_ptr_stale_rcu(ca, ptr) > 0));
1054 	rcu_read_unlock();
1055 
1056 	return bkey_deleted(k.k);
1057 }
1058 
1059 /*
1060  * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
1061  *
1062  * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
1063  * the promote target.
1064  */
1065 bool bch2_extent_normalize_by_opts(struct bch_fs *c,
1066 				   struct bch_io_opts *opts,
1067 				   struct bkey_s k)
1068 {
1069 	struct bkey_ptrs ptrs;
1070 	bool have_cached_ptr;
1071 
1072 	rcu_read_lock();
1073 restart_drop_ptrs:
1074 	ptrs = bch2_bkey_ptrs(k);
1075 	have_cached_ptr = false;
1076 
1077 	bkey_for_each_ptr(ptrs, ptr)
1078 		if (ptr->cached) {
1079 			if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
1080 				bch2_bkey_drop_ptr(k, ptr);
1081 				goto restart_drop_ptrs;
1082 			}
1083 			have_cached_ptr = true;
1084 		}
1085 	rcu_read_unlock();
1086 
1087 	return bkey_deleted(k.k);
1088 }
1089 
1090 void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
1091 {
1092 	out->atomic++;
1093 	rcu_read_lock();
1094 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
1095 	if (!ca) {
1096 		prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
1097 			   (u64) ptr->offset, ptr->gen,
1098 			   ptr->cached ? " cached" : "");
1099 	} else {
1100 		u32 offset;
1101 		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
1102 
1103 		prt_printf(out, "ptr: %u:%llu:%u gen %u",
1104 			   ptr->dev, b, offset, ptr->gen);
1105 		if (ca->mi.durability != 1)
1106 			prt_printf(out, " d=%u", ca->mi.durability);
1107 		if (ptr->cached)
1108 			prt_str(out, " cached");
1109 		if (ptr->unwritten)
1110 			prt_str(out, " unwritten");
1111 		int stale = dev_ptr_stale_rcu(ca, ptr);
1112 		if (stale > 0)
1113 			prt_printf(out, " stale");
1114 		else if (stale)
1115 			prt_printf(out, " invalid");
1116 	}
1117 	rcu_read_unlock();
1118 	--out->atomic;
1119 }
1120 
1121 void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
1122 {
1123 	prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
1124 		   crc->compressed_size,
1125 		   crc->uncompressed_size,
1126 		   crc->offset, crc->nonce);
1127 	bch2_prt_csum_type(out, crc->csum_type);
1128 	prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
1129 	prt_str(out, " compress ");
1130 	bch2_prt_compression_type(out, crc->compression_type);
1131 }
1132 
1133 static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
1134 					  const struct bch_extent_rebalance *r)
1135 {
1136 	prt_str(out, "rebalance:");
1137 
1138 	prt_printf(out, " replicas=%u", r->data_replicas);
1139 	if (r->data_replicas_from_inode)
1140 		prt_str(out, " (inode)");
1141 
1142 	prt_str(out, " checksum=");
1143 	bch2_prt_csum_opt(out, r->data_checksum);
1144 	if (r->data_checksum_from_inode)
1145 		prt_str(out, " (inode)");
1146 
1147 	if (r->background_compression || r->background_compression_from_inode) {
1148 		prt_str(out, " background_compression=");
1149 		bch2_compression_opt_to_text(out, r->background_compression);
1150 
1151 		if (r->background_compression_from_inode)
1152 			prt_str(out, " (inode)");
1153 	}
1154 
1155 	if (r->background_target || r->background_target_from_inode) {
1156 		prt_str(out, " background_target=");
1157 		if (c)
1158 			bch2_target_to_text(out, c, r->background_target);
1159 		else
1160 			prt_printf(out, "%u", r->background_target);
1161 
1162 		if (r->background_target_from_inode)
1163 			prt_str(out, " (inode)");
1164 	}
1165 
1166 	if (r->promote_target || r->promote_target_from_inode) {
1167 		prt_str(out, " promote_target=");
1168 		if (c)
1169 			bch2_target_to_text(out, c, r->promote_target);
1170 		else
1171 			prt_printf(out, "%u", r->promote_target);
1172 
1173 		if (r->promote_target_from_inode)
1174 			prt_str(out, " (inode)");
1175 	}
1176 
1177 	if (r->erasure_code || r->erasure_code_from_inode) {
1178 		prt_printf(out, " ec=%u", r->erasure_code);
1179 		if (r->erasure_code_from_inode)
1180 			prt_str(out, " (inode)");
1181 	}
1182 }
1183 
1184 void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
1185 			    struct bkey_s_c k)
1186 {
1187 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1188 	const union bch_extent_entry *entry;
1189 	bool first = true;
1190 
1191 	if (c)
1192 		prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
1193 
1194 	bkey_extent_entry_for_each(ptrs, entry) {
1195 		if (!first)
1196 			prt_printf(out, " ");
1197 
1198 		switch (__extent_entry_type(entry)) {
1199 		case BCH_EXTENT_ENTRY_ptr:
1200 			bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
1201 			break;
1202 
1203 		case BCH_EXTENT_ENTRY_crc32:
1204 		case BCH_EXTENT_ENTRY_crc64:
1205 		case BCH_EXTENT_ENTRY_crc128: {
1206 			struct bch_extent_crc_unpacked crc =
1207 				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
1208 
1209 			bch2_extent_crc_unpacked_to_text(out, &crc);
1210 			break;
1211 		}
1212 		case BCH_EXTENT_ENTRY_stripe_ptr: {
1213 			const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
1214 
1215 			prt_printf(out, "ec: idx %llu block %u",
1216 			       (u64) ec->idx, ec->block);
1217 			break;
1218 		}
1219 		case BCH_EXTENT_ENTRY_rebalance:
1220 			bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
1221 			break;
1222 
1223 		default:
1224 			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
1225 			return;
1226 		}
1227 
1228 		first = false;
1229 	}
1230 }
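
/*
 * Roughly, the printed form of an extent with a single checksummed pointer
 * looks like (field values purely illustrative):
 *
 *	durability: 1 crc: c_size 8 size 8 offset 0 nonce 0 csum crc32c 0:849da301 compress none ptr: 0:213:8 gen 7
 *
 * where the pointer is shown as device:bucket:offset-within-bucket.
 */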
1231 
1232 static int extent_ptr_validate(struct bch_fs *c,
1233 			       struct bkey_s_c k,
1234 			       struct bkey_validate_context from,
1235 			       const struct bch_extent_ptr *ptr,
1236 			       unsigned size_ondisk,
1237 			       bool metadata)
1238 {
1239 	int ret = 0;
1240 
1241 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1242 	bkey_for_each_ptr(ptrs, ptr2)
1243 		bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
1244 				 c, ptr_to_duplicate_device,
1245 				 "multiple pointers to same device (%u)", ptr->dev);
1246 
1247 	/* bad pointers are repaired by check_fix_ptrs(): */
1248 	rcu_read_lock();
1249 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
1250 	if (!ca) {
1251 		rcu_read_unlock();
1252 		return 0;
1253 	}
1254 	u32 bucket_offset;
1255 	u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
1256 	unsigned first_bucket	= ca->mi.first_bucket;
1257 	u64 nbuckets		= ca->mi.nbuckets;
1258 	unsigned bucket_size	= ca->mi.bucket_size;
1259 	rcu_read_unlock();
1260 
1261 	bkey_fsck_err_on(bucket >= nbuckets,
1262 			 c, ptr_after_last_bucket,
1263 			 "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
1264 	bkey_fsck_err_on(bucket < first_bucket,
1265 			 c, ptr_before_first_bucket,
1266 			 "pointer before first bucket (%llu < %u)", bucket, first_bucket);
1267 	bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
1268 			 c, ptr_spans_multiple_buckets,
1269 			 "pointer spans multiple buckets (%u + %u > %u)",
1270 		       bucket_offset, size_ondisk, bucket_size);
1271 fsck_err:
1272 	return ret;
1273 }
1274 
1275 int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
1276 			    struct bkey_validate_context from)
1277 {
1278 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1279 	const union bch_extent_entry *entry;
1280 	struct bch_extent_crc_unpacked crc;
1281 	unsigned size_ondisk = k.k->size;
1282 	unsigned nonce = UINT_MAX;
1283 	unsigned nr_ptrs = 0;
1284 	bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
1285 	int ret = 0;
1286 
1287 	if (bkey_is_btree_ptr(k.k))
1288 		size_ondisk = btree_sectors(c);
1289 
1290 	bkey_extent_entry_for_each(ptrs, entry) {
1291 		bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
1292 				 c, extent_ptrs_invalid_entry,
1293 				 "invalid extent entry type (got %u, max %u)",
1294 				 __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
1295 
1296 		bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
1297 				 !extent_entry_is_ptr(entry),
1298 				 c, btree_ptr_has_non_ptr,
1299 				 "has non ptr field");
1300 
1301 		switch (extent_entry_type(entry)) {
1302 		case BCH_EXTENT_ENTRY_ptr:
1303 			ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
1304 			if (ret)
1305 				return ret;
1306 
1307 			bkey_fsck_err_on(entry->ptr.cached && have_ec,
1308 					 c, ptr_cached_and_erasure_coded,
1309 					 "cached, erasure coded ptr");
1310 
1311 			if (!entry->ptr.unwritten)
1312 				have_written = true;
1313 			else
1314 				have_unwritten = true;
1315 
1316 			have_ec = false;
1317 			crc_since_last_ptr = false;
1318 			nr_ptrs++;
1319 			break;
1320 		case BCH_EXTENT_ENTRY_crc32:
1321 		case BCH_EXTENT_ENTRY_crc64:
1322 		case BCH_EXTENT_ENTRY_crc128:
1323 			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
1324 
1325 			bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
1326 					 c, ptr_crc_csum_type_unknown,
1327 					 "invalid checksum type");
1328 			bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
1329 					 c, ptr_crc_compression_type_unknown,
1330 					 "invalid compression type");
1331 
1332 			bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
1333 					 c, ptr_crc_uncompressed_size_too_small,
1334 					 "checksum offset + key size > uncompressed size");
1335 			bkey_fsck_err_on(crc_is_encoded(crc) &&
1336 					 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
1337 					 (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
1338 					 c, ptr_crc_uncompressed_size_too_big,
1339 					 "too large encoded extent");
1340 			bkey_fsck_err_on(!crc_is_compressed(crc) &&
1341 					 crc.compressed_size != crc.uncompressed_size,
1342 					 c, ptr_crc_uncompressed_size_mismatch,
1343 					 "not compressed but compressed != uncompressed size");
1344 
1345 			if (bch2_csum_type_is_encryption(crc.csum_type)) {
1346 				if (nonce == UINT_MAX)
1347 					nonce = crc.offset + crc.nonce;
1348 				else if (nonce != crc.offset + crc.nonce)
1349 					bkey_fsck_err(c, ptr_crc_nonce_mismatch,
1350 						      "incorrect nonce");
1351 			}
1352 
1353 			bkey_fsck_err_on(crc_since_last_ptr,
1354 					 c, ptr_crc_redundant,
1355 					 "redundant crc entry");
1356 			crc_since_last_ptr = true;
1357 
1358 			size_ondisk = crc.compressed_size;
1359 			break;
1360 		case BCH_EXTENT_ENTRY_stripe_ptr:
1361 			bkey_fsck_err_on(have_ec,
1362 					 c, ptr_stripe_redundant,
1363 					 "redundant stripe entry");
1364 			have_ec = true;
1365 			break;
1366 		case BCH_EXTENT_ENTRY_rebalance: {
1367 			/*
1368 			 * this shouldn't be a fsck error, for forward
1369 			 * compatibility; the rebalance code should just refetch
1370 			 * the compression opt if it's unknown
1371 			 */
1372 #if 0
1373 			const struct bch_extent_rebalance *r = &entry->rebalance;
1374 
1375 			if (!bch2_compression_opt_valid(r->compression)) {
1376 				struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
1377 				prt_printf(err, "invalid compression opt %u:%u",
1378 					   opt.type, opt.level);
1379 				return -BCH_ERR_invalid_bkey;
1380 			}
1381 #endif
1382 			break;
1383 		}
1384 		}
1385 	}
1386 
1387 	bkey_fsck_err_on(!nr_ptrs,
1388 			 c, extent_ptrs_no_ptrs,
1389 			 "no ptrs");
1390 	bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
1391 			 c, extent_ptrs_too_many_ptrs,
1392 			 "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
1393 	bkey_fsck_err_on(have_written && have_unwritten,
1394 			 c, extent_ptrs_written_and_unwritten,
1395 			 "extent with unwritten and written ptrs");
1396 	bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
1397 			 c, extent_ptrs_unwritten,
1398 			 "has unwritten ptrs");
1399 	bkey_fsck_err_on(crc_since_last_ptr,
1400 			 c, extent_ptrs_redundant_crc,
1401 			 "redundant crc entry");
1402 	bkey_fsck_err_on(have_ec,
1403 			 c, extent_ptrs_redundant_stripe,
1404 			 "redundant stripe entry");
1405 fsck_err:
1406 	return ret;
1407 }
1408 
1409 void bch2_ptr_swab(struct bkey_s k)
1410 {
1411 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1412 	union bch_extent_entry *entry;
1413 	u64 *d;
1414 
1415 	for (d =  (u64 *) ptrs.start;
1416 	     d != (u64 *) ptrs.end;
1417 	     d++)
1418 		*d = swab64(*d);
1419 
1420 	for (entry = ptrs.start;
1421 	     entry < ptrs.end;
1422 	     entry = extent_entry_next(entry)) {
1423 		switch (__extent_entry_type(entry)) {
1424 		case BCH_EXTENT_ENTRY_ptr:
1425 			break;
1426 		case BCH_EXTENT_ENTRY_crc32:
1427 			entry->crc32.csum = swab32(entry->crc32.csum);
1428 			break;
1429 		case BCH_EXTENT_ENTRY_crc64:
1430 			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
1431 			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
1432 			break;
1433 		case BCH_EXTENT_ENTRY_crc128:
1434 			entry->crc128.csum.hi = (__force __le64)
1435 				swab64((__force u64) entry->crc128.csum.hi);
1436 			entry->crc128.csum.lo = (__force __le64)
1437 				swab64((__force u64) entry->crc128.csum.lo);
1438 			break;
1439 		case BCH_EXTENT_ENTRY_stripe_ptr:
1440 			break;
1441 		case BCH_EXTENT_ENTRY_rebalance:
1442 			break;
1443 		default:
1444 			/* Bad entry type: will be caught by validate() */
1445 			return;
1446 		}
1447 	}
1448 }
1449 
1450 /* Generic extent code: */
1451 
1452 int bch2_cut_front_s(struct bpos where, struct bkey_s k)
1453 {
1454 	unsigned new_val_u64s = bkey_val_u64s(k.k);
1455 	int val_u64s_delta;
1456 	u64 sub;
1457 
1458 	if (bkey_le(where, bkey_start_pos(k.k)))
1459 		return 0;
1460 
1461 	EBUG_ON(bkey_gt(where, k.k->p));
1462 
1463 	sub = where.offset - bkey_start_offset(k.k);
1464 
1465 	k.k->size -= sub;
1466 
1467 	if (!k.k->size) {
1468 		k.k->type = KEY_TYPE_deleted;
1469 		new_val_u64s = 0;
1470 	}
1471 
1472 	switch (k.k->type) {
1473 	case KEY_TYPE_extent:
1474 	case KEY_TYPE_reflink_v: {
1475 		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1476 		union bch_extent_entry *entry;
1477 		bool seen_crc = false;
1478 
1479 		bkey_extent_entry_for_each(ptrs, entry) {
1480 			switch (extent_entry_type(entry)) {
1481 			case BCH_EXTENT_ENTRY_ptr:
1482 				if (!seen_crc)
1483 					entry->ptr.offset += sub;
1484 				break;
1485 			case BCH_EXTENT_ENTRY_crc32:
1486 				entry->crc32.offset += sub;
1487 				break;
1488 			case BCH_EXTENT_ENTRY_crc64:
1489 				entry->crc64.offset += sub;
1490 				break;
1491 			case BCH_EXTENT_ENTRY_crc128:
1492 				entry->crc128.offset += sub;
1493 				break;
1494 			case BCH_EXTENT_ENTRY_stripe_ptr:
1495 				break;
1496 			case BCH_EXTENT_ENTRY_rebalance:
1497 				break;
1498 			}
1499 
1500 			if (extent_entry_is_crc(entry))
1501 				seen_crc = true;
1502 		}
1503 
1504 		break;
1505 	}
1506 	case KEY_TYPE_reflink_p: {
1507 		struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
1508 
1509 		SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
1510 		break;
1511 	}
1512 	case KEY_TYPE_inline_data:
1513 	case KEY_TYPE_indirect_inline_data: {
1514 		void *p = bkey_inline_data_p(k);
1515 		unsigned bytes = bkey_inline_data_bytes(k.k);
1516 
1517 		sub = min_t(u64, sub << 9, bytes);
1518 
1519 		memmove(p, p + sub, bytes - sub);
1520 
1521 		new_val_u64s -= sub >> 3;
1522 		break;
1523 	}
1524 	}
1525 
1526 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
1527 	BUG_ON(val_u64s_delta < 0);
1528 
1529 	set_bkey_val_u64s(k.k, new_val_u64s);
1530 	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
1531 	return -val_u64s_delta;
1532 }
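
/*
 * Example (illustrative): cutting the front 8 sectors off a 16-sector extent
 * halves k->size; for checksummed pointers the crc entries' offset fields are
 * advanced by 8 (so the existing checksums stay valid), while unchecksummed
 * pointers have their device offsets advanced directly. bch2_cut_back_s()
 * below is the mirror image, trimming from the back of the key.
 */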
1533 
1534 int bch2_cut_back_s(struct bpos where, struct bkey_s k)
1535 {
1536 	unsigned new_val_u64s = bkey_val_u64s(k.k);
1537 	int val_u64s_delta;
1538 	u64 len = 0;
1539 
1540 	if (bkey_ge(where, k.k->p))
1541 		return 0;
1542 
1543 	EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
1544 
1545 	len = where.offset - bkey_start_offset(k.k);
1546 
1547 	k.k->p.offset = where.offset;
1548 	k.k->size = len;
1549 
1550 	if (!len) {
1551 		k.k->type = KEY_TYPE_deleted;
1552 		new_val_u64s = 0;
1553 	}
1554 
1555 	switch (k.k->type) {
1556 	case KEY_TYPE_inline_data:
1557 	case KEY_TYPE_indirect_inline_data:
1558 		new_val_u64s = (bkey_inline_data_offset(k.k) +
1559 				min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
1560 		break;
1561 	}
1562 
1563 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
1564 	BUG_ON(val_u64s_delta < 0);
1565 
1566 	set_bkey_val_u64s(k.k, new_val_u64s);
1567 	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
1568 	return -val_u64s_delta;
1569 }
1570