xref: /linux/fs/bcachefs/extents.c (revision 031fba65fc202abf1f193e321be7a2c274fd88ba)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4  *
5  * Code for managing the extent btree and dynamically updating the writeback
6  * dirty sector count.
7  */
8 
9 #include "bcachefs.h"
10 #include "bkey_methods.h"
11 #include "btree_gc.h"
12 #include "btree_io.h"
13 #include "btree_iter.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "debug.h"
17 #include "disk_groups.h"
18 #include "error.h"
19 #include "extents.h"
20 #include "inode.h"
21 #include "journal.h"
22 #include "replicas.h"
23 #include "super.h"
24 #include "super-io.h"
25 #include "trace.h"
26 #include "util.h"
27 
28 static unsigned bch2_crc_field_size_max[] = {
29 	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
30 	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
31 	[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
32 };
33 
34 static void bch2_extent_crc_pack(union bch_extent_crc *,
35 				 struct bch_extent_crc_unpacked,
36 				 enum bch_extent_entry_type);
37 
38 static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
39 						   unsigned dev)
40 {
41 	struct bch_dev_io_failures *i;
42 
43 	for (i = f->devs; i < f->devs + f->nr; i++)
44 		if (i->dev == dev)
45 			return i;
46 
47 	return NULL;
48 }
49 
50 void bch2_mark_io_failure(struct bch_io_failures *failed,
51 			  struct extent_ptr_decoded *p)
52 {
53 	struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
54 
55 	if (!f) {
56 		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
57 
58 		f = &failed->devs[failed->nr++];
59 		f->dev		= p->ptr.dev;
60 		f->idx		= p->idx;
61 		f->nr_failed	= 1;
62 		f->nr_retries	= 0;
63 	} else if (p->idx != f->idx) {
64 		f->idx		= p->idx;
65 		f->nr_failed	= 1;
66 		f->nr_retries	= 0;
67 	} else {
68 		f->nr_failed++;
69 	}
70 }
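
/*
 * A minimal sketch of the failure-tracking rule above, using a hypothetical
 * example_io_failure struct rather than struct bch_dev_io_failures: one
 * record per device counts consecutive failures of the replica/strategy
 * currently being tried (idx); a failure at a different idx restarts the
 * count.
 */
struct example_io_failure {
	unsigned	idx;		/* replica/strategy that failed */
	unsigned	nr_failed;	/* consecutive failures at this idx */
};

static inline void example_mark_failure(struct example_io_failure *f,
					unsigned failed_idx)
{
	if (f->idx != failed_idx) {
		f->idx		= failed_idx;
		f->nr_failed	= 1;
	} else {
		f->nr_failed++;
	}
}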
71 
72 /*
73  * Returns true if p1 is a better choice to read from than p2:
74  */
75 static inline bool ptr_better(struct bch_fs *c,
76 			      const struct extent_ptr_decoded p1,
77 			      const struct extent_ptr_decoded p2)
78 {
79 	if (likely(!p1.idx && !p2.idx)) {
80 		struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
81 		struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
82 
83 		u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
84 		u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
85 
86 		/* Pick at random, biased in favor of the faster device: */
87 
88 		return bch2_rand_range(l1 + l2) > l1;
89 	}
90 
91 	if (bch2_force_reconstruct_read)
92 		return p1.idx > p2.idx;
93 
94 	return p1.idx < p2.idx;
95 }
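
/*
 * The latency-biased pick above chooses p1 with probability roughly
 * l2 / (l1 + l2): a replica whose device is three times faster is picked
 * about three times as often (l1 = 1, l2 = 3 gives p1 odds of ~3/4). A
 * minimal sketch with a caller-supplied random value standing in for
 * bch2_rand_range():
 */
static inline bool example_pick_p1(u64 l1, u64 l2, u64 r)
{
	/* r is assumed uniform in [0, l1 + l2) */
	return r > l1;
}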
96 
97 /*
98  * This picks a non-stale pointer to read from, preferring replicas that have
99  * not recently failed (per @failed, which may be NULL): returns 1 if a pointer
100  * was picked, 0 for a hole/unwritten extent, and -EIO if the data is unreadable.
101  */
102 int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
103 			       struct bch_io_failures *failed,
104 			       struct extent_ptr_decoded *pick)
105 {
106 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
107 	const union bch_extent_entry *entry;
108 	struct extent_ptr_decoded p;
109 	struct bch_dev_io_failures *f;
110 	struct bch_dev *ca;
111 	int ret = 0;
112 
113 	if (k.k->type == KEY_TYPE_error)
114 		return -EIO;
115 
116 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
117 		/*
118 		 * Unwritten extent: no need to actually read, treat it as a
119 		 * hole and return 0s:
120 		 */
121 		if (p.ptr.unwritten)
122 			return 0;
123 
124 		ca = bch_dev_bkey_exists(c, p.ptr.dev);
125 
126 		/*
127 		 * If there are any dirty pointers it's an error if we can't
128 		 * read:
129 		 */
130 		if (!ret && !p.ptr.cached)
131 			ret = -EIO;
132 
133 		if (p.ptr.cached && ptr_stale(ca, &p.ptr))
134 			continue;
135 
136 		f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
137 		if (f)
138 			p.idx = f->nr_failed < f->nr_retries
139 				? f->idx
140 				: f->idx + 1;
141 
142 		if (!p.idx &&
143 		    !bch2_dev_is_readable(ca))
144 			p.idx++;
145 
146 		if (bch2_force_reconstruct_read &&
147 		    !p.idx && p.has_ec)
148 			p.idx++;
149 
150 		if (p.idx >= (unsigned) p.has_ec + 1)
151 			continue;
152 
153 		if (ret > 0 && !ptr_better(c, p, *pick))
154 			continue;
155 
156 		*pick = p;
157 		ret = 1;
158 	}
159 
160 	return ret;
161 }
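
/*
 * For each pointer, p.idx selects how it would be read: 0 reads the replica
 * directly, 1 is only useful for erasure coded pointers (reconstruct read),
 * and once idx reaches has_ec + 1 the pointer is exhausted and skipped.
 * A minimal sketch (hypothetical helper) of how the failure record above
 * picks the idx to try next:
 */
static inline unsigned example_next_read_idx(unsigned prev_idx,
					     unsigned nr_failed,
					     unsigned nr_retries)
{
	/* retry the same strategy while retries remain, else move on */
	return nr_failed < nr_retries ? prev_idx : prev_idx + 1;
}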
162 
163 /* KEY_TYPE_btree_ptr: */
164 
165 int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
166 			   enum bkey_invalid_flags flags,
167 			   struct printbuf *err)
168 {
169 	if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
170 		prt_printf(err, "value too big (%zu > %u)",
171 		       bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
172 		return -BCH_ERR_invalid_bkey;
173 	}
174 
175 	return bch2_bkey_ptrs_invalid(c, k, flags, err);
176 }
177 
178 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
179 			    struct bkey_s_c k)
180 {
181 	bch2_bkey_ptrs_to_text(out, c, k);
182 }
183 
184 int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
185 			      enum bkey_invalid_flags flags,
186 			      struct printbuf *err)
187 {
188 	if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {
189 		prt_printf(err, "value too big (%zu > %zu)",
190 		       bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
191 		return -BCH_ERR_invalid_bkey;
192 	}
193 
194 	return bch2_bkey_ptrs_invalid(c, k, flags, err);
195 }
196 
197 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
198 			       struct bkey_s_c k)
199 {
200 	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
201 
202 	prt_printf(out, "seq %llx written %u min_key %s",
203 	       le64_to_cpu(bp.v->seq),
204 	       le16_to_cpu(bp.v->sectors_written),
205 	       BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
206 
207 	bch2_bpos_to_text(out, bp.v->min_key);
208 	prt_printf(out, " ");
209 	bch2_bkey_ptrs_to_text(out, c, k);
210 }
211 
212 void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
213 			      unsigned big_endian, int write,
214 			      struct bkey_s k)
215 {
216 	struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
217 
218 	compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
219 
220 	if (version < bcachefs_metadata_version_inode_btree_change &&
221 	    btree_id_is_extents(btree_id) &&
222 	    !bkey_eq(bp.v->min_key, POS_MIN))
223 		bp.v->min_key = write
224 			? bpos_nosnap_predecessor(bp.v->min_key)
225 			: bpos_nosnap_successor(bp.v->min_key);
226 }
227 
228 /* KEY_TYPE_extent: */
229 
230 bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
231 {
232 	struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
233 	struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
234 	union bch_extent_entry *en_l;
235 	const union bch_extent_entry *en_r;
236 	struct extent_ptr_decoded lp, rp;
237 	bool use_right_ptr;
238 	struct bch_dev *ca;
239 
240 	en_l = l_ptrs.start;
241 	en_r = r_ptrs.start;
242 	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
243 		if (extent_entry_type(en_l) != extent_entry_type(en_r))
244 			return false;
245 
246 		en_l = extent_entry_next(en_l);
247 		en_r = extent_entry_next(en_r);
248 	}
249 
250 	if (en_l < l_ptrs.end || en_r < r_ptrs.end)
251 		return false;
252 
253 	en_l = l_ptrs.start;
254 	en_r = r_ptrs.start;
255 	lp.crc = bch2_extent_crc_unpack(l.k, NULL);
256 	rp.crc = bch2_extent_crc_unpack(r.k, NULL);
257 
258 	while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
259 	       __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
260 		if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
261 		    rp.ptr.offset + rp.crc.offset ||
262 		    lp.ptr.dev			!= rp.ptr.dev ||
263 		    lp.ptr.gen			!= rp.ptr.gen ||
264 		    lp.ptr.unwritten		!= rp.ptr.unwritten ||
265 		    lp.has_ec			!= rp.has_ec)
266 			return false;
267 
268 		/* Extents may not straddle buckets: */
269 		ca = bch_dev_bkey_exists(c, lp.ptr.dev);
270 		if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr))
271 			return false;
272 
273 		if (lp.has_ec			!= rp.has_ec ||
274 		    (lp.has_ec &&
275 		     (lp.ec.block		!= rp.ec.block ||
276 		      lp.ec.redundancy		!= rp.ec.redundancy ||
277 		      lp.ec.idx			!= rp.ec.idx)))
278 			return false;
279 
280 		if (lp.crc.compression_type	!= rp.crc.compression_type ||
281 		    lp.crc.nonce		!= rp.crc.nonce)
282 			return false;
283 
284 		if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
285 		    lp.crc.uncompressed_size) {
286 			/* can use left extent's crc entry */
287 		} else if (lp.crc.live_size <= rp.crc.offset) {
288 			/* can use right extent's crc entry */
289 		} else {
290 			/* check if checksums can be merged: */
291 			if (lp.crc.csum_type		!= rp.crc.csum_type ||
292 			    lp.crc.nonce		!= rp.crc.nonce ||
293 			    crc_is_compressed(lp.crc) ||
294 			    !bch2_checksum_mergeable(lp.crc.csum_type))
295 				return false;
296 
297 			if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
298 			    rp.crc.offset)
299 				return false;
300 
301 			if (lp.crc.csum_type &&
302 			    lp.crc.uncompressed_size +
303 			    rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
304 				return false;
305 		}
306 
307 		en_l = extent_entry_next(en_l);
308 		en_r = extent_entry_next(en_r);
309 	}
310 
311 	en_l = l_ptrs.start;
312 	en_r = r_ptrs.start;
313 	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
314 		if (extent_entry_is_crc(en_l)) {
315 			struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
316 			struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
317 
318 			if (crc_l.uncompressed_size + crc_r.uncompressed_size >
319 			    bch2_crc_field_size_max[extent_entry_type(en_l)])
320 				return false;
321 		}
322 
323 		en_l = extent_entry_next(en_l);
324 		en_r = extent_entry_next(en_r);
325 	}
326 
327 	use_right_ptr = false;
328 	en_l = l_ptrs.start;
329 	en_r = r_ptrs.start;
330 	while (en_l < l_ptrs.end) {
331 		if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
332 		    use_right_ptr)
333 			en_l->ptr = en_r->ptr;
334 
335 		if (extent_entry_is_crc(en_l)) {
336 			struct bch_extent_crc_unpacked crc_l =
337 				bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
338 			struct bch_extent_crc_unpacked crc_r =
339 				bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
340 
341 			use_right_ptr = false;
342 
343 			if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
344 			    crc_l.uncompressed_size) {
345 				/* can use left extent's crc entry */
346 			} else if (crc_l.live_size <= crc_r.offset) {
347 				/* can use right extent's crc entry */
348 				crc_r.offset -= crc_l.live_size;
349 				bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
350 						     extent_entry_type(en_l));
351 				use_right_ptr = true;
352 			} else {
353 				crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
354 								 crc_l.csum,
355 								 crc_r.csum,
356 								 crc_r.uncompressed_size << 9);
357 
358 				crc_l.uncompressed_size	+= crc_r.uncompressed_size;
359 				crc_l.compressed_size	+= crc_r.compressed_size;
360 				bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
361 						     extent_entry_type(en_l));
362 			}
363 		}
364 
365 		en_l = extent_entry_next(en_l);
366 		en_r = extent_entry_next(en_r);
367 	}
368 
369 	bch2_key_resize(l.k, l.k->size + r.k->size);
370 	return true;
371 }
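
/*
 * The key contiguity requirement above: for every pair of pointers, the right
 * extent's data must begin exactly where the left extent's live data ends on
 * disk, i.e.
 *
 *	lp.ptr.offset + lp.crc.offset + lp.crc.live_size ==
 *	rp.ptr.offset + rp.crc.offset
 *
 * For example, a left pointer at device offset 1000 with crc.offset 0 and
 * live_size 8 can only merge with a right pointer whose ptr.offset +
 * crc.offset is 1008. A minimal sketch of that check with plain integers:
 */
static inline bool example_ptrs_contiguous(u64 l_ptr_offset, u64 l_crc_offset,
					   u64 l_live_size,
					   u64 r_ptr_offset, u64 r_crc_offset)
{
	return l_ptr_offset + l_crc_offset + l_live_size ==
	       r_ptr_offset + r_crc_offset;
}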
372 
373 /* KEY_TYPE_reservation: */
374 
375 int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
376 			     enum bkey_invalid_flags flags,
377 			     struct printbuf *err)
378 {
379 	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
380 
381 	if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {
382 		prt_printf(err, "invalid nr_replicas (%u)",
383 		       r.v->nr_replicas);
384 		return -BCH_ERR_invalid_bkey;
385 	}
386 
387 	return 0;
388 }
389 
390 void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
391 			      struct bkey_s_c k)
392 {
393 	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
394 
395 	prt_printf(out, "generation %u replicas %u",
396 	       le32_to_cpu(r.v->generation),
397 	       r.v->nr_replicas);
398 }
399 
400 bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
401 {
402 	struct bkey_s_reservation l = bkey_s_to_reservation(_l);
403 	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
404 
405 	if (l.v->generation != r.v->generation ||
406 	    l.v->nr_replicas != r.v->nr_replicas)
407 		return false;
408 
409 	bch2_key_resize(l.k, l.k->size + r.k->size);
410 	return true;
411 }
412 
413 /* Extent checksum entries: */
414 
415 /* Returns true if the two unpacked crc entries are not equal */
416 static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
417 					 struct bch_extent_crc_unpacked r)
418 {
419 	return (l.csum_type		!= r.csum_type ||
420 		l.compression_type	!= r.compression_type ||
421 		l.compressed_size	!= r.compressed_size ||
422 		l.uncompressed_size	!= r.uncompressed_size ||
423 		l.offset		!= r.offset ||
424 		l.live_size		!= r.live_size ||
425 		l.nonce			!= r.nonce ||
426 		bch2_crc_cmp(l.csum, r.csum));
427 }
428 
429 static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
430 				  struct bch_extent_crc_unpacked n)
431 {
432 	return !crc_is_compressed(u) &&
433 		u.csum_type &&
434 		u.uncompressed_size > u.live_size &&
435 		bch2_csum_type_is_encryption(u.csum_type) ==
436 		bch2_csum_type_is_encryption(n.csum_type);
437 }
438 
439 bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
440 				 struct bch_extent_crc_unpacked n)
441 {
442 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
443 	struct bch_extent_crc_unpacked crc;
444 	const union bch_extent_entry *i;
445 
446 	if (!n.csum_type)
447 		return false;
448 
449 	bkey_for_each_crc(k.k, ptrs, crc, i)
450 		if (can_narrow_crc(crc, n))
451 			return true;
452 
453 	return false;
454 }
455 
456 /*
457  * We're writing another replica for this extent, so while we've got the data in
458  * memory we'll be computing a new checksum for the currently live data.
459  *
460  * If there are other replicas we aren't moving, and they are checksummed but
461  * not compressed, we can modify them to point to only the data that is
462  * currently live (so that readers won't have to bounce) while we've got the
463  * checksum we need:
464  */
465 bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
466 {
467 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
468 	struct bch_extent_crc_unpacked u;
469 	struct extent_ptr_decoded p;
470 	union bch_extent_entry *i;
471 	bool ret = false;
472 
473 	/* Find a checksum entry that covers only live data: */
474 	if (!n.csum_type) {
475 		bkey_for_each_crc(&k->k, ptrs, u, i)
476 			if (!crc_is_compressed(u) &&
477 			    u.csum_type &&
478 			    u.live_size == u.uncompressed_size) {
479 				n = u;
480 				goto found;
481 			}
482 		return false;
483 	}
484 found:
485 	BUG_ON(crc_is_compressed(n));
486 	BUG_ON(n.offset);
487 	BUG_ON(n.live_size != k->k.size);
488 
489 restart_narrow_pointers:
490 	ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
491 
492 	bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
493 		if (can_narrow_crc(p.crc, n)) {
494 			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
495 			p.ptr.offset += p.crc.offset;
496 			p.crc = n;
497 			bch2_extent_ptr_decoded_append(k, &p);
498 			ret = true;
499 			goto restart_narrow_pointers;
500 		}
501 
502 	return ret;
503 }
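
/*
 * Worked example of narrowing (hypothetical numbers): a pointer whose crc
 * entry has uncompressed_size 128, offset 64 and live_size 8 forces readers
 * to read and checksum all 128 sectors to get at the 8 live ones. Once a
 * checksum covering exactly the live range is available, the pointer's device
 * offset is advanced by the old crc.offset and the new crc has offset 0 and
 * size 8, so reads no longer need to bounce. A minimal sketch of the offset
 * adjustment done above:
 */
static inline u64 example_narrowed_ptr_offset(u64 ptr_offset, u64 old_crc_offset)
{
	/* mirrors p.ptr.offset += p.crc.offset in bch2_bkey_narrow_crcs() */
	return ptr_offset + old_crc_offset;
}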
504 
505 static void bch2_extent_crc_pack(union bch_extent_crc *dst,
506 				 struct bch_extent_crc_unpacked src,
507 				 enum bch_extent_entry_type type)
508 {
509 #define set_common_fields(_dst, _src)					\
510 		_dst.type		= 1 << type;			\
511 		_dst.csum_type		= _src.csum_type,		\
512 		_dst.compression_type	= _src.compression_type,	\
513 		_dst._compressed_size	= _src.compressed_size - 1,	\
514 		_dst._uncompressed_size	= _src.uncompressed_size - 1,	\
515 		_dst.offset		= _src.offset
516 
517 	switch (type) {
518 	case BCH_EXTENT_ENTRY_crc32:
519 		set_common_fields(dst->crc32, src);
520 		dst->crc32.csum		= (u32 __force) *((__le32 *) &src.csum.lo);
521 		break;
522 	case BCH_EXTENT_ENTRY_crc64:
523 		set_common_fields(dst->crc64, src);
524 		dst->crc64.nonce	= src.nonce;
525 		dst->crc64.csum_lo	= (u64 __force) src.csum.lo;
526 		dst->crc64.csum_hi	= (u64 __force) *((__le16 *) &src.csum.hi);
527 		break;
528 	case BCH_EXTENT_ENTRY_crc128:
529 		set_common_fields(dst->crc128, src);
530 		dst->crc128.nonce	= src.nonce;
531 		dst->crc128.csum	= src.csum;
532 		break;
533 	default:
534 		BUG();
535 	}
536 #undef set_common_fields
537 }
538 
539 void bch2_extent_crc_append(struct bkey_i *k,
540 			    struct bch_extent_crc_unpacked new)
541 {
542 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
543 	union bch_extent_crc *crc = (void *) ptrs.end;
544 	enum bch_extent_entry_type type;
545 
546 	if (bch_crc_bytes[new.csum_type]	<= 4 &&
547 	    new.uncompressed_size		<= CRC32_SIZE_MAX &&
548 	    new.nonce				<= CRC32_NONCE_MAX)
549 		type = BCH_EXTENT_ENTRY_crc32;
550 	else if (bch_crc_bytes[new.csum_type]	<= 10 &&
551 		   new.uncompressed_size	<= CRC64_SIZE_MAX &&
552 		   new.nonce			<= CRC64_NONCE_MAX)
553 		type = BCH_EXTENT_ENTRY_crc64;
554 	else if (bch_crc_bytes[new.csum_type]	<= 16 &&
555 		   new.uncompressed_size	<= CRC128_SIZE_MAX &&
556 		   new.nonce			<= CRC128_NONCE_MAX)
557 		type = BCH_EXTENT_ENTRY_crc128;
558 	else
559 		BUG();
560 
561 	bch2_extent_crc_pack(crc, new, type);
562 
563 	k->k.u64s += extent_entry_u64s(ptrs.end);
564 
565 	EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
566 }
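
/*
 * The cascade above packs the new checksum into the smallest entry type whose
 * field widths fit the checksum bytes, extent size and nonce. A minimal
 * sketch (hypothetical helper) returning the chosen width instead of a
 * BCH_EXTENT_ENTRY_* constant:
 */
static inline unsigned example_crc_entry_width(unsigned csum_bytes,
					       unsigned size, unsigned nonce)
{
	if (csum_bytes <= 4 && size <= CRC32_SIZE_MAX && nonce <= CRC32_NONCE_MAX)
		return 32;
	if (csum_bytes <= 10 && size <= CRC64_SIZE_MAX && nonce <= CRC64_NONCE_MAX)
		return 64;
	return 128;
}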
567 
568 /* Generic code for keys with pointers: */
569 
570 unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
571 {
572 	return bch2_bkey_devs(k).nr;
573 }
574 
575 unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
576 {
577 	return k.k->type == KEY_TYPE_reservation
578 		? bkey_s_c_to_reservation(k).v->nr_replicas
579 		: bch2_bkey_dirty_devs(k).nr;
580 }
581 
582 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
583 {
584 	unsigned ret = 0;
585 
586 	if (k.k->type == KEY_TYPE_reservation) {
587 		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
588 	} else {
589 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
590 		const union bch_extent_entry *entry;
591 		struct extent_ptr_decoded p;
592 
593 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
594 			ret += !p.ptr.cached && !crc_is_compressed(p.crc);
595 	}
596 
597 	return ret;
598 }
599 
600 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
601 {
602 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
603 	const union bch_extent_entry *entry;
604 	struct extent_ptr_decoded p;
605 	unsigned ret = 0;
606 
607 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
608 		if (!p.ptr.cached && crc_is_compressed(p.crc))
609 			ret += p.crc.compressed_size;
610 
611 	return ret;
612 }
613 
614 bool bch2_bkey_is_incompressible(struct bkey_s_c k)
615 {
616 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
617 	const union bch_extent_entry *entry;
618 	struct bch_extent_crc_unpacked crc;
619 
620 	bkey_for_each_crc(k.k, ptrs, crc, entry)
621 		if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
622 			return true;
623 	return false;
624 }
625 
626 unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
627 {
628 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
629 	const union bch_extent_entry *entry;
630 	struct extent_ptr_decoded p = { 0 };
631 	unsigned replicas = 0;
632 
633 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
634 		if (p.ptr.cached)
635 			continue;
636 
637 		if (p.has_ec)
638 			replicas += p.ec.redundancy;
639 
640 		replicas++;
641 
642 	}
643 
644 	return replicas;
645 }
646 
647 unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
648 {
649 	struct bch_dev *ca;
650 
651 	if (p->ptr.cached)
652 		return 0;
653 
654 	ca = bch_dev_bkey_exists(c, p->ptr.dev);
655 
656 	return ca->mi.durability +
657 		(p->has_ec
658 		 ? p->ec.redundancy
659 		 : 0);
660 }
661 
662 unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
663 {
664 	struct bch_dev *ca;
665 
666 	if (p->ptr.cached)
667 		return 0;
668 
669 	ca = bch_dev_bkey_exists(c, p->ptr.dev);
670 
671 	if (ca->mi.state == BCH_MEMBER_STATE_failed)
672 		return 0;
673 
674 	return ca->mi.durability +
675 		(p->has_ec
676 		 ? p->ec.redundancy
677 		 : 0);
678 }
679 
680 unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
681 {
682 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
683 	const union bch_extent_entry *entry;
684 	struct extent_ptr_decoded p;
685 	unsigned durability = 0;
686 
687 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
688 		durability += bch2_extent_ptr_durability(c, &p);
689 
690 	return durability;
691 }
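
/*
 * Durability is summed per pointer: cached pointers contribute nothing, a
 * dirty pointer contributes its device's durability, plus the stripe
 * redundancy if it is erasure coded. For example, one cached pointer plus one
 * plain pointer on a durability-1 device plus one erasure coded pointer
 * (redundancy 1) on a durability-1 device gives 0 + 1 + 2 = 3. A minimal
 * per-pointer sketch:
 */
static inline unsigned example_ptr_durability(bool cached, unsigned dev_durability,
					      bool has_ec, unsigned ec_redundancy)
{
	return cached ? 0 : dev_durability + (has_ec ? ec_redundancy : 0);
}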
692 
693 static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
694 {
695 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
696 	const union bch_extent_entry *entry;
697 	struct extent_ptr_decoded p;
698 	unsigned durability = 0;
699 
700 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
701 		if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
702 			durability += bch2_extent_ptr_durability(c, &p);
703 
704 	return durability;
705 }
706 
707 void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
708 {
709 	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
710 	union bch_extent_entry *next = extent_entry_next(entry);
711 
712 	memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
713 	k->k.u64s -= extent_entry_u64s(entry);
714 }
715 
716 void bch2_extent_ptr_decoded_append(struct bkey_i *k,
717 				    struct extent_ptr_decoded *p)
718 {
719 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
720 	struct bch_extent_crc_unpacked crc =
721 		bch2_extent_crc_unpack(&k->k, NULL);
722 	union bch_extent_entry *pos;
723 
724 	if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
725 		pos = ptrs.start;
726 		goto found;
727 	}
728 
729 	bkey_for_each_crc(&k->k, ptrs, crc, pos)
730 		if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
731 			pos = extent_entry_next(pos);
732 			goto found;
733 		}
734 
735 	bch2_extent_crc_append(k, p->crc);
736 	pos = bkey_val_end(bkey_i_to_s(k));
737 found:
738 	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
739 	__extent_entry_insert(k, pos, to_entry(&p->ptr));
740 
741 	if (p->has_ec) {
742 		p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
743 		__extent_entry_insert(k, pos, to_entry(&p->ec));
744 	}
745 }
746 
747 static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
748 					  union bch_extent_entry *entry)
749 {
750 	union bch_extent_entry *i = ptrs.start;
751 
752 	if (i == entry)
753 		return NULL;
754 
755 	while (extent_entry_next(i) != entry)
756 		i = extent_entry_next(i);
757 	return i;
758 }
759 
760 static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
761 {
762 	union bch_extent_entry *next = extent_entry_next(entry);
763 
764 	/* stripes have ptrs, but their layout doesn't work with this code */
765 	BUG_ON(k.k->type == KEY_TYPE_stripe);
766 
767 	memmove_u64s_down(entry, next,
768 			  (u64 *) bkey_val_end(k) - (u64 *) next);
769 	k.k->u64s -= (u64 *) next - (u64 *) entry;
770 }
771 
772 /*
773  * Returns a pointer to the next entry after the one being dropped:
774  */
775 union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
776 						   struct bch_extent_ptr *ptr)
777 {
778 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
779 	union bch_extent_entry *entry = to_entry(ptr), *next;
780 	union bch_extent_entry *ret = entry;
781 	bool drop_crc = true;
782 
783 	EBUG_ON(ptr < &ptrs.start->ptr ||
784 		ptr >= &ptrs.end->ptr);
785 	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
786 
787 	for (next = extent_entry_next(entry);
788 	     next != ptrs.end;
789 	     next = extent_entry_next(next)) {
790 		if (extent_entry_is_crc(next)) {
791 			break;
792 		} else if (extent_entry_is_ptr(next)) {
793 			drop_crc = false;
794 			break;
795 		}
796 	}
797 
798 	extent_entry_drop(k, entry);
799 
800 	while ((entry = extent_entry_prev(ptrs, entry))) {
801 		if (extent_entry_is_ptr(entry))
802 			break;
803 
804 		if ((extent_entry_is_crc(entry) && drop_crc) ||
805 		    extent_entry_is_stripe_ptr(entry)) {
806 			ret = (void *) ret - extent_entry_bytes(entry);
807 			extent_entry_drop(k, entry);
808 		}
809 	}
810 
811 	return ret;
812 }
813 
814 union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
815 					   struct bch_extent_ptr *ptr)
816 {
817 	bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
818 	union bch_extent_entry *ret =
819 		bch2_bkey_drop_ptr_noerror(k, ptr);
820 
821 	/*
822 	 * If we deleted all the dirty pointers and there's still cached
823 	 * pointers, we could set the cached pointers to dirty if they're not
824 	 * stale - but to do that correctly we'd need to grab an open_bucket
825 	 * reference so that we don't race with bucket reuse:
826 	 */
827 	if (have_dirty &&
828 	    !bch2_bkey_dirty_devs(k.s_c).nr) {
829 		k.k->type = KEY_TYPE_error;
830 		set_bkey_val_u64s(k.k, 0);
831 		ret = NULL;
832 	} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
833 		k.k->type = KEY_TYPE_deleted;
834 		set_bkey_val_u64s(k.k, 0);
835 		ret = NULL;
836 	}
837 
838 	return ret;
839 }
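
/*
 * After a pointer is dropped the key may need downgrading: if the last dirty
 * pointer went away while cached pointers remain, the key becomes an error
 * key (the cached copies can't safely be promoted to dirty without guarding
 * against bucket reuse, per the comment above); if no pointers remain at all,
 * the key becomes deleted. A minimal sketch of that decision:
 */
static inline const char *example_key_after_drop(bool had_dirty,
						 unsigned dirty_left,
						 unsigned ptrs_left)
{
	if (had_dirty && !dirty_left)
		return "error";
	if (!ptrs_left)
		return "deleted";
	return "unchanged";
}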
840 
841 void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
842 {
843 	struct bch_extent_ptr *ptr;
844 
845 	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
846 }
847 
848 void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
849 {
850 	struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
851 
852 	if (ptr)
853 		bch2_bkey_drop_ptr_noerror(k, ptr);
854 }
855 
856 const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
857 {
858 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
859 	const struct bch_extent_ptr *ptr;
860 
861 	bkey_for_each_ptr(ptrs, ptr)
862 		if (ptr->dev == dev)
863 			return ptr;
864 
865 	return NULL;
866 }
867 
868 bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
869 {
870 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
871 	const struct bch_extent_ptr *ptr;
872 
873 	bkey_for_each_ptr(ptrs, ptr)
874 		if (bch2_dev_in_target(c, ptr->dev, target) &&
875 		    (!ptr->cached ||
876 		     !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
877 			return true;
878 
879 	return false;
880 }
881 
882 bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
883 			   struct bch_extent_ptr m, u64 offset)
884 {
885 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
886 	const union bch_extent_entry *entry;
887 	struct extent_ptr_decoded p;
888 
889 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
890 		if (p.ptr.dev	== m.dev &&
891 		    p.ptr.gen	== m.gen &&
892 		    (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
893 		    (s64) m.offset  - offset)
894 			return true;
895 
896 	return false;
897 }
898 
899 /*
900  * Returns true if two extents refer to the same data:
901  */
902 bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
903 {
904 	if (k1.k->type != k2.k->type)
905 		return false;
906 
907 	if (bkey_extent_is_direct_data(k1.k)) {
908 		struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
909 		struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
910 		const union bch_extent_entry *entry1, *entry2;
911 		struct extent_ptr_decoded p1, p2;
912 
913 		if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
914 			return false;
915 
916 		bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
917 			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
918 				if (p1.ptr.dev		== p2.ptr.dev &&
919 				    p1.ptr.gen		== p2.ptr.gen &&
920 				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
921 				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
922 					return true;
923 
924 		return false;
925 	} else {
926 		/* KEY_TYPE_deleted, etc. */
927 		return true;
928 	}
929 }
930 
931 struct bch_extent_ptr *
932 bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
933 {
934 	struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
935 	union bch_extent_entry *entry2;
936 	struct extent_ptr_decoded p2;
937 
938 	bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
939 		if (p1.ptr.dev		== p2.ptr.dev &&
940 		    p1.ptr.gen		== p2.ptr.gen &&
941 		    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
942 		    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
943 			return &entry2->ptr;
944 
945 	return NULL;
946 }
947 
948 void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
949 {
950 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
951 	union bch_extent_entry *entry;
952 	union bch_extent_entry *ec = NULL;
953 
954 	bkey_extent_entry_for_each(ptrs, entry) {
955 		if (&entry->ptr == ptr) {
956 			ptr->cached = true;
957 			if (ec)
958 				extent_entry_drop(k, ec);
959 			return;
960 		}
961 
962 		if (extent_entry_is_stripe_ptr(entry))
963 			ec = entry;
964 		else if (extent_entry_is_ptr(entry))
965 			ec = NULL;
966 	}
967 
968 	BUG();
969 }
970 
971 /*
972  * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
973  *
974  * Returns true if @k should be dropped entirely
975  *
976  * For existing keys, only called when btree nodes are being rewritten, not when
977  * they're merely being compacted/resorted in memory.
978  */
979 bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
980 {
981 	struct bch_extent_ptr *ptr;
982 
983 	bch2_bkey_drop_ptrs(k, ptr,
984 		ptr->cached &&
985 		ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
986 
987 	return bkey_deleted(k.k);
988 }
989 
990 void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
991 			    struct bkey_s_c k)
992 {
993 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
994 	const union bch_extent_entry *entry;
995 	struct bch_extent_crc_unpacked crc;
996 	const struct bch_extent_ptr *ptr;
997 	const struct bch_extent_stripe_ptr *ec;
998 	struct bch_dev *ca;
999 	bool first = true;
1000 
1001 	if (c)
1002 		prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
1003 
1004 	bkey_extent_entry_for_each(ptrs, entry) {
1005 		if (!first)
1006 			prt_printf(out, " ");
1007 
1008 		switch (__extent_entry_type(entry)) {
1009 		case BCH_EXTENT_ENTRY_ptr:
1010 			ptr = entry_to_ptr(entry);
1011 			ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
1012 				? bch_dev_bkey_exists(c, ptr->dev)
1013 				: NULL;
1014 
1015 			if (!ca) {
1016 				prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
1017 				       (u64) ptr->offset, ptr->gen,
1018 				       ptr->cached ? " cached" : "");
1019 			} else {
1020 				u32 offset;
1021 				u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
1022 
1023 				prt_printf(out, "ptr: %u:%llu:%u gen %u",
1024 					   ptr->dev, b, offset, ptr->gen);
1025 				if (ptr->cached)
1026 					prt_str(out, " cached");
1027 				if (ptr->unwritten)
1028 					prt_str(out, " unwritten");
1029 				if (ca && ptr_stale(ca, ptr))
1030 					prt_printf(out, " stale");
1031 			}
1032 			break;
1033 		case BCH_EXTENT_ENTRY_crc32:
1034 		case BCH_EXTENT_ENTRY_crc64:
1035 		case BCH_EXTENT_ENTRY_crc128:
1036 			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
1037 
1038 			prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
1039 			       crc.compressed_size,
1040 			       crc.uncompressed_size,
1041 			       crc.offset, crc.nonce,
1042 			       bch2_csum_types[crc.csum_type],
1043 			       bch2_compression_types[crc.compression_type]);
1044 			break;
1045 		case BCH_EXTENT_ENTRY_stripe_ptr:
1046 			ec = &entry->stripe_ptr;
1047 
1048 			prt_printf(out, "ec: idx %llu block %u",
1049 			       (u64) ec->idx, ec->block);
1050 			break;
1051 		default:
1052 			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
1053 			return;
1054 		}
1055 
1056 		first = false;
1057 	}
1058 }
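
/*
 * With the format strings above, a two-replica checksummed extent prints
 * roughly as (hypothetical values):
 *
 *	durability: 2 crc: c_size 8 size 8 offset 0 nonce 0 csum crc32c
 *	compress none ptr: 0:153:24 gen 1 ptr: 1:76:8 gen 3
 */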
1059 
1060 static int extent_ptr_invalid(const struct bch_fs *c,
1061 			      struct bkey_s_c k,
1062 			      enum bkey_invalid_flags flags,
1063 			      const struct bch_extent_ptr *ptr,
1064 			      unsigned size_ondisk,
1065 			      bool metadata,
1066 			      struct printbuf *err)
1067 {
1068 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1069 	const struct bch_extent_ptr *ptr2;
1070 	u64 bucket;
1071 	u32 bucket_offset;
1072 	struct bch_dev *ca;
1073 
1074 	if (!bch2_dev_exists2(c, ptr->dev)) {
1075 		/*
1076 		 * If we're in the write path this key might have already been
1077 		 * overwritten, and we could be seeing a device that doesn't
1078 		 * exist anymore due to racing with device removal:
1079 		 */
1080 		if (flags & BKEY_INVALID_WRITE)
1081 			return 0;
1082 
1083 		prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
1084 		return -BCH_ERR_invalid_bkey;
1085 	}
1086 
1087 	ca = bch_dev_bkey_exists(c, ptr->dev);
1088 	bkey_for_each_ptr(ptrs, ptr2)
1089 		if (ptr != ptr2 && ptr->dev == ptr2->dev) {
1090 			prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
1091 			return -BCH_ERR_invalid_bkey;
1092 		}
1093 
1094 	bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
1095 
1096 	if (bucket >= ca->mi.nbuckets) {
1097 		prt_printf(err, "pointer past last bucket (%llu > %llu)",
1098 		       bucket, ca->mi.nbuckets);
1099 		return -BCH_ERR_invalid_bkey;
1100 	}
1101 
1102 	if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
1103 		prt_printf(err, "pointer before first bucket (%llu < %u)",
1104 		       bucket, ca->mi.first_bucket);
1105 		return -BCH_ERR_invalid_bkey;
1106 	}
1107 
1108 	if (bucket_offset + size_ondisk > ca->mi.bucket_size) {
1109 		prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
1110 		       bucket_offset, size_ondisk, ca->mi.bucket_size);
1111 		return -BCH_ERR_invalid_bkey;
1112 	}
1113 
1114 	return 0;
1115 }
1116 
1117 int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
1118 			   enum bkey_invalid_flags flags,
1119 			   struct printbuf *err)
1120 {
1121 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1122 	const union bch_extent_entry *entry;
1123 	struct bch_extent_crc_unpacked crc;
1124 	unsigned size_ondisk = k.k->size;
1125 	unsigned nonce = UINT_MAX;
1126 	unsigned nr_ptrs = 0;
1127 	bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
1128 	int ret;
1129 
1130 	if (bkey_is_btree_ptr(k.k))
1131 		size_ondisk = btree_sectors(c);
1132 
1133 	bkey_extent_entry_for_each(ptrs, entry) {
1134 		if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
1135 			prt_printf(err, "invalid extent entry type (got %u, max %u)",
1136 			       __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
1137 			return -BCH_ERR_invalid_bkey;
1138 		}
1139 
1140 		if (bkey_is_btree_ptr(k.k) &&
1141 		    !extent_entry_is_ptr(entry)) {
1142 			prt_printf(err, "has non ptr field");
1143 			return -BCH_ERR_invalid_bkey;
1144 		}
1145 
1146 		switch (extent_entry_type(entry)) {
1147 		case BCH_EXTENT_ENTRY_ptr:
1148 			ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
1149 						 size_ondisk, false, err);
1150 			if (ret)
1151 				return ret;
1152 
1153 			if (nr_ptrs && unwritten != entry->ptr.unwritten) {
1154 				prt_printf(err, "extent with unwritten and written ptrs");
1155 				return -BCH_ERR_invalid_bkey;
1156 			}
1157 
1158 			if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) {
1159 				prt_printf(err, "has unwritten ptrs");
1160 				return -BCH_ERR_invalid_bkey;
1161 			}
1162 
1163 			if (entry->ptr.cached && have_ec) {
1164 				prt_printf(err, "cached, erasure coded ptr");
1165 				return -BCH_ERR_invalid_bkey;
1166 			}
1167 
1168 			unwritten = entry->ptr.unwritten;
1169 			have_ec = false;
1170 			crc_since_last_ptr = false;
1171 			nr_ptrs++;
1172 			break;
1173 		case BCH_EXTENT_ENTRY_crc32:
1174 		case BCH_EXTENT_ENTRY_crc64:
1175 		case BCH_EXTENT_ENTRY_crc128:
1176 			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
1177 
1178 			if (crc.offset + crc.live_size >
1179 			    crc.uncompressed_size) {
1180 				prt_printf(err, "checksum offset + key size > uncompressed size");
1181 				return -BCH_ERR_invalid_bkey;
1182 			}
1183 
1184 			size_ondisk = crc.compressed_size;
1185 
1186 			if (!bch2_checksum_type_valid(c, crc.csum_type)) {
1187 				prt_printf(err, "invalid checksum type");
1188 				return -BCH_ERR_invalid_bkey;
1189 			}
1190 
1191 			if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
1192 				prt_printf(err, "invalid compression type");
1193 				return -BCH_ERR_invalid_bkey;
1194 			}
1195 
1196 			if (bch2_csum_type_is_encryption(crc.csum_type)) {
1197 				if (nonce == UINT_MAX)
1198 					nonce = crc.offset + crc.nonce;
1199 				else if (nonce != crc.offset + crc.nonce) {
1200 					prt_printf(err, "incorrect nonce");
1201 					return -BCH_ERR_invalid_bkey;
1202 				}
1203 			}
1204 
1205 			if (crc_since_last_ptr) {
1206 				prt_printf(err, "redundant crc entry");
1207 				return -BCH_ERR_invalid_bkey;
1208 			}
1209 			crc_since_last_ptr = true;
1210 			break;
1211 		case BCH_EXTENT_ENTRY_stripe_ptr:
1212 			if (have_ec) {
1213 				prt_printf(err, "redundant stripe entry");
1214 				return -BCH_ERR_invalid_bkey;
1215 			}
1216 			have_ec = true;
1217 			break;
1218 		case BCH_EXTENT_ENTRY_rebalance:
1219 			break;
1220 		}
1221 	}
1222 
1223 	if (!nr_ptrs) {
1224 		prt_str(err, "no ptrs");
1225 		return -BCH_ERR_invalid_bkey;
1226 	}
1227 
1228 	if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
1229 		prt_str(err, "too many ptrs");
1230 		return -BCH_ERR_invalid_bkey;
1231 	}
1232 
1233 	if (crc_since_last_ptr) {
1234 		prt_printf(err, "redundant crc entry");
1235 		return -BCH_ERR_invalid_bkey;
1236 	}
1237 
1238 	if (have_ec) {
1239 		prt_printf(err, "redundant stripe entry");
1240 		return -BCH_ERR_invalid_bkey;
1241 	}
1242 
1243 	return 0;
1244 }
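
/*
 * One of the less obvious invariants checked above is nonce consistency for
 * encrypted extents: every crc entry in the key must agree on
 * crc.offset + crc.nonce. A minimal sketch of that check over plain arrays:
 */
static inline bool example_nonces_consistent(const unsigned *offsets,
					     const unsigned *nonces,
					     unsigned nr)
{
	unsigned i, want = nr ? offsets[0] + nonces[0] : 0;

	for (i = 0; i < nr; i++)
		if (offsets[i] + nonces[i] != want)
			return false;
	return true;
}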
1245 
1246 void bch2_ptr_swab(struct bkey_s k)
1247 {
1248 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1249 	union bch_extent_entry *entry;
1250 	u64 *d;
1251 
1252 	for (d =  (u64 *) ptrs.start;
1253 	     d != (u64 *) ptrs.end;
1254 	     d++)
1255 		*d = swab64(*d);
1256 
1257 	for (entry = ptrs.start;
1258 	     entry < ptrs.end;
1259 	     entry = extent_entry_next(entry)) {
1260 		switch (extent_entry_type(entry)) {
1261 		case BCH_EXTENT_ENTRY_ptr:
1262 			break;
1263 		case BCH_EXTENT_ENTRY_crc32:
1264 			entry->crc32.csum = swab32(entry->crc32.csum);
1265 			break;
1266 		case BCH_EXTENT_ENTRY_crc64:
1267 			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
1268 			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
1269 			break;
1270 		case BCH_EXTENT_ENTRY_crc128:
1271 			entry->crc128.csum.hi = (__force __le64)
1272 				swab64((__force u64) entry->crc128.csum.hi);
1273 			entry->crc128.csum.lo = (__force __le64)
1274 				swab64((__force u64) entry->crc128.csum.lo);
1275 			break;
1276 		case BCH_EXTENT_ENTRY_stripe_ptr:
1277 			break;
1278 		case BCH_EXTENT_ENTRY_rebalance:
1279 			break;
1280 		}
1281 	}
1282 }
1283 
1284 /* Generic extent code: */
1285 
1286 int bch2_cut_front_s(struct bpos where, struct bkey_s k)
1287 {
1288 	unsigned new_val_u64s = bkey_val_u64s(k.k);
1289 	int val_u64s_delta;
1290 	u64 sub;
1291 
1292 	if (bkey_le(where, bkey_start_pos(k.k)))
1293 		return 0;
1294 
1295 	EBUG_ON(bkey_gt(where, k.k->p));
1296 
1297 	sub = where.offset - bkey_start_offset(k.k);
1298 
1299 	k.k->size -= sub;
1300 
1301 	if (!k.k->size) {
1302 		k.k->type = KEY_TYPE_deleted;
1303 		new_val_u64s = 0;
1304 	}
1305 
1306 	switch (k.k->type) {
1307 	case KEY_TYPE_extent:
1308 	case KEY_TYPE_reflink_v: {
1309 		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1310 		union bch_extent_entry *entry;
1311 		bool seen_crc = false;
1312 
1313 		bkey_extent_entry_for_each(ptrs, entry) {
1314 			switch (extent_entry_type(entry)) {
1315 			case BCH_EXTENT_ENTRY_ptr:
1316 				if (!seen_crc)
1317 					entry->ptr.offset += sub;
1318 				break;
1319 			case BCH_EXTENT_ENTRY_crc32:
1320 				entry->crc32.offset += sub;
1321 				break;
1322 			case BCH_EXTENT_ENTRY_crc64:
1323 				entry->crc64.offset += sub;
1324 				break;
1325 			case BCH_EXTENT_ENTRY_crc128:
1326 				entry->crc128.offset += sub;
1327 				break;
1328 			case BCH_EXTENT_ENTRY_stripe_ptr:
1329 				break;
1330 			case BCH_EXTENT_ENTRY_rebalance:
1331 				break;
1332 			}
1333 
1334 			if (extent_entry_is_crc(entry))
1335 				seen_crc = true;
1336 		}
1337 
1338 		break;
1339 	}
1340 	case KEY_TYPE_reflink_p: {
1341 		struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
1342 
1343 		le64_add_cpu(&p.v->idx, sub);
1344 		break;
1345 	}
1346 	case KEY_TYPE_inline_data:
1347 	case KEY_TYPE_indirect_inline_data: {
1348 		void *p = bkey_inline_data_p(k);
1349 		unsigned bytes = bkey_inline_data_bytes(k.k);
1350 
1351 		sub = min_t(u64, sub << 9, bytes);
1352 
1353 		memmove(p, p + sub, bytes - sub);
1354 
1355 		new_val_u64s -= sub >> 3;
1356 		break;
1357 	}
1358 	}
1359 
1360 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
1361 	BUG_ON(val_u64s_delta < 0);
1362 
1363 	set_bkey_val_u64s(k.k, new_val_u64s);
1364 	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
1365 	return -val_u64s_delta;
1366 }
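
/*
 * Worked example for the front cut above (hypothetical numbers): cutting 4
 * sectors off the front of a 16-sector extent leaves size 12; pointers not
 * covered by a preceding crc entry advance their device offset by 4, while
 * for checksummed data the crc entry's offset grows by 4 instead and the
 * pointer stays put, so the stored checksum still covers what was written.
 * A minimal per-entry sketch:
 */
static inline void example_cut_front_entry(u64 *ptr_offset, u64 *crc_offset,
					   bool covered_by_crc, u64 sub)
{
	if (covered_by_crc)
		*crc_offset += sub;	/* checksummed: shift within the crc */
	else
		*ptr_offset += sub;	/* bare pointer: shift device offset */
}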
1367 
1368 int bch2_cut_back_s(struct bpos where, struct bkey_s k)
1369 {
1370 	unsigned new_val_u64s = bkey_val_u64s(k.k);
1371 	int val_u64s_delta;
1372 	u64 len = 0;
1373 
1374 	if (bkey_ge(where, k.k->p))
1375 		return 0;
1376 
1377 	EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
1378 
1379 	len = where.offset - bkey_start_offset(k.k);
1380 
1381 	k.k->p.offset = where.offset;
1382 	k.k->size = len;
1383 
1384 	if (!len) {
1385 		k.k->type = KEY_TYPE_deleted;
1386 		new_val_u64s = 0;
1387 	}
1388 
1389 	switch (k.k->type) {
1390 	case KEY_TYPE_inline_data:
1391 	case KEY_TYPE_indirect_inline_data:
1392 		new_val_u64s = (bkey_inline_data_offset(k.k) +
1393 				min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
1394 		break;
1395 	}
1396 
1397 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
1398 	BUG_ON(val_u64s_delta < 0);
1399 
1400 	set_bkey_val_u64s(k.k, new_val_u64s);
1401 	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
1402 	return -val_u64s_delta;
1403 }
1404