xref: /linux/fs/bcachefs/extents.c (revision ff0905bbf991f4337b5ebc19c0d43525ebb0d96b)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4  *
5  * Code for managing the extent btree and dynamically updating the writeback
6  * dirty sector count.
7  */
8 
9 #include "bcachefs.h"
10 #include "bkey_methods.h"
11 #include "btree_cache.h"
12 #include "btree_gc.h"
13 #include "btree_io.h"
14 #include "btree_iter.h"
15 #include "buckets.h"
16 #include "checksum.h"
17 #include "compress.h"
18 #include "debug.h"
19 #include "disk_groups.h"
20 #include "error.h"
21 #include "extents.h"
22 #include "inode.h"
23 #include "journal.h"
24 #include "rebalance.h"
25 #include "replicas.h"
26 #include "super.h"
27 #include "super-io.h"
28 #include "trace.h"
29 #include "util.h"
30 
31 static const char * const bch2_extent_flags_strs[] = {
32 #define x(n, v)	[BCH_EXTENT_FLAG_##n] = #n,
33 	BCH_EXTENT_FLAGS()
34 #undef x
35 	NULL,
36 };
37 
38 static unsigned bch2_crc_field_size_max[] = {
39 	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
40 	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
41 	[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
42 };
43 
44 static void bch2_extent_crc_pack(union bch_extent_crc *,
45 				 struct bch_extent_crc_unpacked,
46 				 enum bch_extent_entry_type);
47 
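/*
 * Print a per-device summary of the read errors recorded in @failed: each
 * device that saw io, checksum or ec reconstruct errors gets one line:
 */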
48 void bch2_io_failures_to_text(struct printbuf *out,
49 			      struct bch_fs *c,
50 			      struct bch_io_failures *failed)
51 {
52 	static const char * const error_types[] = {
53 		"io", "checksum", "ec reconstruct", NULL
54 	};
55 
56 	for (struct bch_dev_io_failures *f = failed->devs;
57 	     f < failed->devs + failed->nr;
58 	     f++) {
59 		unsigned errflags =
60 			((!!f->failed_io)	<< 0) |
61 			((!!f->failed_csum_nr)	<< 1) |
62 			((!!f->failed_ec)	<< 2);
63 
64 		if (!errflags)
65 			continue;
66 
67 		bch2_printbuf_make_room(out, 1024);
68 		out->atomic++;
69 		scoped_guard(rcu) {
70 			struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
71 			if (ca)
72 				prt_str(out, ca->name);
73 			else
74 				prt_printf(out, "(invalid device %u)", f->dev);
75 		}
76 		--out->atomic;
77 
78 		prt_char(out, ' ');
79 
80 		if (is_power_of_2(errflags)) {
81 			prt_bitflags(out, error_types, errflags);
82 			prt_str(out, " error");
83 		} else {
84 			prt_str(out, "errors: ");
85 			prt_bitflags(out, error_types, errflags);
86 		}
87 		prt_newline(out);
88 	}
89 }
90 
91 struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
92 						 unsigned dev)
93 {
94 	struct bch_dev_io_failures *i;
95 
96 	for (i = f->devs; i < f->devs + f->nr; i++)
97 		if (i->dev == dev)
98 			return i;
99 
100 	return NULL;
101 }
102 
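/*
 * Record a failed read from @p: ec reconstruct failures, io errors and
 * checksum errors (which are counted, since they may be retried) are tracked
 * separately:
 */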
103 void bch2_mark_io_failure(struct bch_io_failures *failed,
104 			  struct extent_ptr_decoded *p,
105 			  bool csum_error)
106 {
107 	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
108 
109 	if (!f) {
110 		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
111 
112 		f = &failed->devs[failed->nr++];
113 		memset(f, 0, sizeof(*f));
114 		f->dev = p->ptr.dev;
115 	}
116 
117 	if (p->do_ec_reconstruct)
118 		f->failed_ec = true;
119 	else if (!csum_error)
120 		f->failed_io = true;
121 	else
122 		f->failed_csum_nr++;
123 }
124 
125 void bch2_mark_btree_validate_failure(struct bch_io_failures *failed,
126 				      unsigned dev)
127 {
128 	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev);
129 
130 	if (!f) {
131 		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
132 
133 		f = &failed->devs[failed->nr++];
134 		memset(f, 0, sizeof(*f));
135 		f->dev = dev;
136 	}
137 
138 	f->failed_btree_validate = true;
139 }
140 
141 static inline u64 dev_latency(struct bch_dev *ca)
142 {
143 	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
144 }
145 
146 static inline int dev_failed(struct bch_dev *ca)
147 {
148 	return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
149 }
150 
151 /*
152  * returns true if p1 is better than p2:
153  */
154 static inline bool ptr_better(struct bch_fs *c,
155 			      const struct extent_ptr_decoded p1,
156 			      u64 p1_latency,
157 			      struct bch_dev *ca1,
158 			      const struct extent_ptr_decoded p2,
159 			      u64 p2_latency)
160 {
161 	struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
162 
163 	int failed_delta = dev_failed(ca1) - dev_failed(ca2);
164 	if (unlikely(failed_delta))
165 		return failed_delta < 0;
166 
167 	if (static_branch_unlikely(&bch2_force_reconstruct_read))
168 		return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
169 
170 	if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
171 		return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
172 
173 	int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
174 	if (unlikely(crc_retry_delta))
175 		return crc_retry_delta < 0;
176 
177 	/* Pick at random, biased in favor of the faster device: */
178 
179 	return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
180 }
181 
182 /*
183  * This picks the best non-stale pointer to read from. Devices with errors
184  * recorded in @failed (which may be NULL) are retried, reconstructed via
185  * erasure coding, or skipped; a non-negative @dev restricts the choice.
186  */
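/*
 * Returns 1 if a pointer was picked, 0 if the extent is a hole or unwritten
 * (nothing to read), or a negative error code if no readable copy was found.
 */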
187 int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
188 			       struct bch_io_failures *failed,
189 			       struct extent_ptr_decoded *pick,
190 			       int dev)
191 {
192 	bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
193 	bool have_dirty_ptrs = false, have_pick = false;
194 
195 	if (k.k->type == KEY_TYPE_error)
196 		return bch_err_throw(c, key_type_error);
197 
198 	rcu_read_lock();
199 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
200 	const union bch_extent_entry *entry;
201 	struct extent_ptr_decoded p;
202 	u64 pick_latency;
203 
204 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
205 		have_dirty_ptrs |= !p.ptr.cached;
206 
207 		/*
208 		 * Unwritten extent: no need to actually read, treat it as a
209 		 * hole and return 0s:
210 		 */
211 		if (p.ptr.unwritten) {
212 			rcu_read_unlock();
213 			return 0;
214 		}
215 
216 		/* Are we being asked to read from a specific device? */
217 		if (dev >= 0 && p.ptr.dev != dev)
218 			continue;
219 
220 		struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
221 
222 		if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) {
223 			rcu_read_unlock();
224 			int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev);
225 			if (ret)
226 				return ret;
227 			rcu_read_lock();
228 		}
229 
230 		if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
231 			continue;
232 
233 		struct bch_dev_io_failures *f =
234 			unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
235 		if (unlikely(f)) {
236 			p.crc_retry_nr	   = f->failed_csum_nr;
237 			p.has_ec	  &= ~f->failed_ec;
238 
239 			if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
240 				have_io_errors	|= f->failed_io;
241 				have_io_errors	|= f->failed_btree_validate;
242 				have_io_errors	|= f->failed_ec;
243 			}
244 			have_csum_errors	|= !!f->failed_csum_nr;
245 
246 			if (p.has_ec && (f->failed_io || f->failed_csum_nr))
247 				p.do_ec_reconstruct = true;
248 			else if (f->failed_io ||
249 				 f->failed_btree_validate ||
250 				 f->failed_csum_nr > c->opts.checksum_err_retry_nr)
251 				continue;
252 		}
253 
254 		have_missing_devs |= ca && !bch2_dev_is_online(ca);
255 
256 		if (!ca || !bch2_dev_is_online(ca)) {
257 			if (!p.has_ec)
258 				continue;
259 			p.do_ec_reconstruct = true;
260 		}
261 
262 		if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec)
263 			p.do_ec_reconstruct = true;
264 
265 		u64 p_latency = dev_latency(ca);
266 		/*
267 		 * Square the latencies, to bias more in favor of the faster
268 		 * device - we never want to stop issuing reads to the slower
269 		 * device altogether, so that we can update our latency numbers:
270 		 */
271 		p_latency *= p_latency;
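		/*
		 * e.g. if one device is 4x slower, the squared weights are
		 * 16:1, so the slower device still sees roughly 1 in 17 reads:
		 */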
272 
273 		if (!have_pick ||
274 		    ptr_better(c,
275 			       p, p_latency, ca,
276 			       *pick, pick_latency)) {
277 			*pick = p;
278 			pick_latency = p_latency;
279 			have_pick = true;
280 		}
281 	}
282 	rcu_read_unlock();
283 
284 	if (have_pick)
285 		return 1;
286 	if (!have_dirty_ptrs)
287 		return 0;
288 	if (have_missing_devs)
289 		return bch_err_throw(c, no_device_to_read_from);
290 	if (have_csum_errors)
291 		return bch_err_throw(c, data_read_csum_err);
292 	if (have_io_errors)
293 		return bch_err_throw(c, data_read_io_err);
294 
295 	/*
296 	 * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
297 	 * but they don't point to valid devices:
298 	 */
299 	return bch_err_throw(c, no_devices_valid);
300 }
301 
302 /* KEY_TYPE_btree_ptr: */
303 
304 int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
305 			    struct bkey_validate_context from)
306 {
307 	int ret = 0;
308 
309 	bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
310 			 c, btree_ptr_val_too_big,
311 			 "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
312 
313 	ret = bch2_bkey_ptrs_validate(c, k, from);
314 fsck_err:
315 	return ret;
316 }
317 
318 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
319 			    struct bkey_s_c k)
320 {
321 	bch2_bkey_ptrs_to_text(out, c, k);
322 }
323 
324 int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
325 			       struct bkey_validate_context from)
326 {
327 	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
328 	int ret = 0;
329 
330 	bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
331 			 c, btree_ptr_v2_val_too_big,
332 			 "value too big (%zu > %zu)",
333 			 bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
334 
335 	bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
336 			 c, btree_ptr_v2_min_key_bad,
337 			 "min_key > key");
338 
339 	if ((from.flags & BCH_VALIDATE_write) &&
340 	    c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
341 		bkey_fsck_err_on(!bp.v->sectors_written,
342 				 c, btree_ptr_v2_written_0,
343 				 "sectors_written == 0");
344 
345 	ret = bch2_bkey_ptrs_validate(c, k, from);
346 fsck_err:
347 	return ret;
348 }
349 
350 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
351 			       struct bkey_s_c k)
352 {
353 	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
354 
355 	prt_printf(out, "seq %llx written %u min_key %s",
356 	       le64_to_cpu(bp.v->seq),
357 	       le16_to_cpu(bp.v->sectors_written),
358 	       BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
359 
360 	bch2_bpos_to_text(out, bp.v->min_key);
361 	prt_printf(out, " ");
362 	bch2_bkey_ptrs_to_text(out, c, k);
363 }
364 
365 void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
366 			      unsigned big_endian, int write,
367 			      struct bkey_s k)
368 {
369 	struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
370 
371 	compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
372 
373 	if (version < bcachefs_metadata_version_inode_btree_change &&
374 	    btree_id_is_extents(btree_id) &&
375 	    !bkey_eq(bp.v->min_key, POS_MIN))
376 		bp.v->min_key = write
377 			? bpos_nosnap_predecessor(bp.v->min_key)
378 			: bpos_nosnap_successor(bp.v->min_key);
379 }
380 
381 /* KEY_TYPE_extent: */
382 
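/*
 * Try to merge @r into @l: requires identical entry layouts, contiguous
 * pointers that stay within one bucket, and checksum entries that can either
 * be reused or merged. On success @l is resized to cover both extents:
 */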
383 bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
384 {
385 	struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
386 	struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
387 	union bch_extent_entry *en_l;
388 	const union bch_extent_entry *en_r;
389 	struct extent_ptr_decoded lp, rp;
390 	bool use_right_ptr;
391 
392 	en_l = l_ptrs.start;
393 	en_r = r_ptrs.start;
394 	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
395 		if (extent_entry_type(en_l) != extent_entry_type(en_r))
396 			return false;
397 
398 		en_l = extent_entry_next(en_l);
399 		en_r = extent_entry_next(en_r);
400 	}
401 
402 	if (en_l < l_ptrs.end || en_r < r_ptrs.end)
403 		return false;
404 
405 	en_l = l_ptrs.start;
406 	en_r = r_ptrs.start;
407 	lp.crc = bch2_extent_crc_unpack(l.k, NULL);
408 	rp.crc = bch2_extent_crc_unpack(r.k, NULL);
409 
410 	guard(rcu)();
411 
412 	while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
413 	       __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
414 		if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
415 		    rp.ptr.offset + rp.crc.offset ||
416 		    lp.ptr.dev			!= rp.ptr.dev ||
417 		    lp.ptr.gen			!= rp.ptr.gen ||
418 		    lp.ptr.unwritten		!= rp.ptr.unwritten ||
419 		    lp.has_ec			!= rp.has_ec)
420 			return false;
421 
422 		/* Extents may not straddle buckets: */
423 		struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
424 		bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
425 
426 		if (!same_bucket)
427 			return false;
428 
429 		if (lp.has_ec			!= rp.has_ec ||
430 		    (lp.has_ec &&
431 		     (lp.ec.block		!= rp.ec.block ||
432 		      lp.ec.redundancy		!= rp.ec.redundancy ||
433 		      lp.ec.idx			!= rp.ec.idx)))
434 			return false;
435 
436 		if (lp.crc.compression_type	!= rp.crc.compression_type ||
437 		    lp.crc.nonce		!= rp.crc.nonce)
438 			return false;
439 
440 		if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
441 		    lp.crc.uncompressed_size) {
442 			/* can use left extent's crc entry */
443 		} else if (lp.crc.live_size <= rp.crc.offset) {
444 			/* can use right extent's crc entry */
445 		} else {
446 			/* check if checksums can be merged: */
447 			if (lp.crc.csum_type		!= rp.crc.csum_type ||
448 			    lp.crc.nonce		!= rp.crc.nonce ||
449 			    crc_is_compressed(lp.crc) ||
450 			    !bch2_checksum_mergeable(lp.crc.csum_type))
451 				return false;
452 
453 			if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
454 			    rp.crc.offset)
455 				return false;
456 
457 			if (lp.crc.csum_type &&
458 			    lp.crc.uncompressed_size +
459 			    rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
460 				return false;
461 		}
462 
463 		en_l = extent_entry_next(en_l);
464 		en_r = extent_entry_next(en_r);
465 	}
466 
467 	en_l = l_ptrs.start;
468 	en_r = r_ptrs.start;
469 	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
470 		if (extent_entry_is_crc(en_l)) {
471 			struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
472 			struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
473 
474 			if (crc_l.uncompressed_size + crc_r.uncompressed_size >
475 			    bch2_crc_field_size_max[extent_entry_type(en_l)])
476 				return false;
477 		}
478 
479 		en_l = extent_entry_next(en_l);
480 		en_r = extent_entry_next(en_r);
481 	}
482 
483 	use_right_ptr = false;
484 	en_l = l_ptrs.start;
485 	en_r = r_ptrs.start;
486 	while (en_l < l_ptrs.end) {
487 		if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
488 		    use_right_ptr)
489 			en_l->ptr = en_r->ptr;
490 
491 		if (extent_entry_is_crc(en_l)) {
492 			struct bch_extent_crc_unpacked crc_l =
493 				bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
494 			struct bch_extent_crc_unpacked crc_r =
495 				bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
496 
497 			use_right_ptr = false;
498 
499 			if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
500 			    crc_l.uncompressed_size) {
501 				/* can use left extent's crc entry */
502 			} else if (crc_l.live_size <= crc_r.offset) {
503 				/* can use right extent's crc entry */
504 				crc_r.offset -= crc_l.live_size;
505 				bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
506 						     extent_entry_type(en_l));
507 				use_right_ptr = true;
508 			} else {
509 				crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
510 								 crc_l.csum,
511 								 crc_r.csum,
512 								 crc_r.uncompressed_size << 9);
513 
514 				crc_l.uncompressed_size	+= crc_r.uncompressed_size;
515 				crc_l.compressed_size	+= crc_r.compressed_size;
516 				bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
517 						     extent_entry_type(en_l));
518 			}
519 		}
520 
521 		en_l = extent_entry_next(en_l);
522 		en_r = extent_entry_next(en_r);
523 	}
524 
525 	bch2_key_resize(l.k, l.k->size + r.k->size);
526 	return true;
527 }
528 
529 /* KEY_TYPE_reservation: */
530 
531 int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
532 			      struct bkey_validate_context from)
533 {
534 	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
535 	int ret = 0;
536 
537 	bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
538 			 c, reservation_key_nr_replicas_invalid,
539 			 "invalid nr_replicas (%u)", r.v->nr_replicas);
540 fsck_err:
541 	return ret;
542 }
543 
544 void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
545 			      struct bkey_s_c k)
546 {
547 	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
548 
549 	prt_printf(out, "generation %u replicas %u",
550 	       le32_to_cpu(r.v->generation),
551 	       r.v->nr_replicas);
552 }
553 
554 bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
555 {
556 	struct bkey_s_reservation l = bkey_s_to_reservation(_l);
557 	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
558 
559 	if (l.v->generation != r.v->generation ||
560 	    l.v->nr_replicas != r.v->nr_replicas)
561 		return false;
562 
563 	bch2_key_resize(l.k, l.k->size + r.k->size);
564 	return true;
565 }
566 
567 /* Extent checksum entries: */
568 
569 /* returns true if not equal */
570 static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
571 					 struct bch_extent_crc_unpacked r)
572 {
573 	return (l.csum_type		!= r.csum_type ||
574 		l.compression_type	!= r.compression_type ||
575 		l.compressed_size	!= r.compressed_size ||
576 		l.uncompressed_size	!= r.uncompressed_size ||
577 		l.offset		!= r.offset ||
578 		l.live_size		!= r.live_size ||
579 		l.nonce			!= r.nonce ||
580 		bch2_crc_cmp(l.csum, r.csum));
581 }
582 
583 static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
584 				  struct bch_extent_crc_unpacked n)
585 {
586 	return !crc_is_compressed(u) &&
587 		u.csum_type &&
588 		u.uncompressed_size > u.live_size &&
589 		bch2_csum_type_is_encryption(u.csum_type) ==
590 		bch2_csum_type_is_encryption(n.csum_type);
591 }
592 
593 bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
594 				 struct bch_extent_crc_unpacked n)
595 {
596 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
597 	struct bch_extent_crc_unpacked crc;
598 	const union bch_extent_entry *i;
599 
600 	if (!n.csum_type)
601 		return false;
602 
603 	bkey_for_each_crc(k.k, ptrs, crc, i)
604 		if (can_narrow_crc(crc, n))
605 			return true;
606 
607 	return false;
608 }
609 
610 /*
611  * We're writing another replica for this extent, so while we've got the data in
612  * memory we'll be computing a new checksum for the currently live data.
613  *
614  * If there are other replicas we aren't moving, and they are checksummed but
615  * not compressed, we can modify them to point to only the data that is
616  * currently live (so that readers won't have to bounce) while we've got the
617  * checksum we need:
618  */
619 bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
620 {
621 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
622 	struct bch_extent_crc_unpacked u;
623 	struct extent_ptr_decoded p;
624 	union bch_extent_entry *i;
625 	bool ret = false;
626 
627 	/* Find a checksum entry that covers only live data: */
628 	if (!n.csum_type) {
629 		bkey_for_each_crc(&k->k, ptrs, u, i)
630 			if (!crc_is_compressed(u) &&
631 			    u.csum_type &&
632 			    u.live_size == u.uncompressed_size) {
633 				n = u;
634 				goto found;
635 			}
636 		return false;
637 	}
638 found:
639 	BUG_ON(crc_is_compressed(n));
640 	BUG_ON(n.offset);
641 	BUG_ON(n.live_size != k->k.size);
642 
643 restart_narrow_pointers:
644 	ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
645 
646 	bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
647 		if (can_narrow_crc(p.crc, n)) {
648 			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
649 			p.ptr.offset += p.crc.offset;
650 			p.crc = n;
651 			bch2_extent_ptr_decoded_append(k, &p);
652 			ret = true;
653 			goto restart_narrow_pointers;
654 		}
655 
656 	return ret;
657 }
658 
659 static void bch2_extent_crc_pack(union bch_extent_crc *dst,
660 				 struct bch_extent_crc_unpacked src,
661 				 enum bch_extent_entry_type type)
662 {
663 #define common_fields(_src)						\
664 		.type			= BIT(type),			\
665 		.csum_type		= _src.csum_type,		\
666 		.compression_type	= _src.compression_type,	\
667 		._compressed_size	= _src.compressed_size - 1,	\
668 		._uncompressed_size	= _src.uncompressed_size - 1,	\
669 		.offset			= _src.offset
670 
671 	switch (type) {
672 	case BCH_EXTENT_ENTRY_crc32:
673 		dst->crc32		= (struct bch_extent_crc32) {
674 			common_fields(src),
675 			.csum		= (u32 __force) *((__le32 *) &src.csum.lo),
676 		};
677 		break;
678 	case BCH_EXTENT_ENTRY_crc64:
679 		dst->crc64		= (struct bch_extent_crc64) {
680 			common_fields(src),
681 			.nonce		= src.nonce,
682 			.csum_lo	= (u64 __force) src.csum.lo,
683 			.csum_hi	= (u64 __force) *((__le16 *) &src.csum.hi),
684 		};
685 		break;
686 	case BCH_EXTENT_ENTRY_crc128:
687 		dst->crc128		= (struct bch_extent_crc128) {
688 			common_fields(src),
689 			.nonce		= src.nonce,
690 			.csum		= src.csum,
691 		};
692 		break;
693 	default:
694 		BUG();
695 	}
696 #undef common_fields
697 }
698 
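/*
 * Append a crc entry for @new to @k, using the smallest crc entry type that
 * can represent its checksum, size and nonce:
 */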
699 void bch2_extent_crc_append(struct bkey_i *k,
700 			    struct bch_extent_crc_unpacked new)
701 {
702 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
703 	union bch_extent_crc *crc = (void *) ptrs.end;
704 	enum bch_extent_entry_type type;
705 
706 	if (bch_crc_bytes[new.csum_type]	<= 4 &&
707 	    new.uncompressed_size		<= CRC32_SIZE_MAX &&
708 	    new.nonce				<= CRC32_NONCE_MAX)
709 		type = BCH_EXTENT_ENTRY_crc32;
710 	else if (bch_crc_bytes[new.csum_type]	<= 10 &&
711 		   new.uncompressed_size	<= CRC64_SIZE_MAX &&
712 		   new.nonce			<= CRC64_NONCE_MAX)
713 		type = BCH_EXTENT_ENTRY_crc64;
714 	else if (bch_crc_bytes[new.csum_type]	<= 16 &&
715 		   new.uncompressed_size	<= CRC128_SIZE_MAX &&
716 		   new.nonce			<= CRC128_NONCE_MAX)
717 		type = BCH_EXTENT_ENTRY_crc128;
718 	else
719 		BUG();
720 
721 	bch2_extent_crc_pack(crc, new, type);
722 
723 	k->k.u64s += extent_entry_u64s(ptrs.end);
724 
725 	EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
726 }
727 
728 /* Generic code for keys with pointers: */
729 
730 unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
731 {
732 	return bch2_bkey_devs(k).nr;
733 }
734 
735 unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
736 {
737 	return k.k->type == KEY_TYPE_reservation
738 		? bkey_s_c_to_reservation(k).v->nr_replicas
739 		: bch2_bkey_dirty_devs(k).nr;
740 }
741 
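/*
 * Count the replicas that are fully allocated on disk: dirty pointers to
 * uncompressed data, or the reservation's nr_replicas:
 */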
742 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
743 {
744 	unsigned ret = 0;
745 
746 	if (k.k->type == KEY_TYPE_reservation) {
747 		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
748 	} else {
749 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
750 		const union bch_extent_entry *entry;
751 		struct extent_ptr_decoded p;
752 
753 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
754 			ret += !p.ptr.cached && !crc_is_compressed(p.crc);
755 	}
756 
757 	return ret;
758 }
759 
760 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
761 {
762 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
763 	const union bch_extent_entry *entry;
764 	struct extent_ptr_decoded p;
765 	unsigned ret = 0;
766 
767 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
768 		if (!p.ptr.cached && crc_is_compressed(p.crc))
769 			ret += p.crc.compressed_size;
770 
771 	return ret;
772 }
773 
774 bool bch2_bkey_is_incompressible(struct bkey_s_c k)
775 {
776 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
777 	const union bch_extent_entry *entry;
778 	struct bch_extent_crc_unpacked crc;
779 
780 	bkey_for_each_crc(k.k, ptrs, crc, entry)
781 		if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
782 			return true;
783 	return false;
784 }
785 
786 unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
787 {
788 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
789 	const union bch_extent_entry *entry;
790 	struct extent_ptr_decoded p = { 0 };
791 	unsigned replicas = 0;
792 
793 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
794 		if (p.ptr.cached)
795 			continue;
796 
797 		if (p.has_ec)
798 			replicas += p.ec.redundancy;
799 
800 		replicas++;
801 
802 	}
803 
804 	return replicas;
805 }
806 
807 static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
808 {
809 	if (p->ptr.cached)
810 		return 0;
811 
812 	return p->has_ec
813 		? p->ec.redundancy + 1
814 		: ca->mi.durability;
815 }
816 
817 unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
818 {
819 	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
820 
821 	return ca ? __extent_ptr_durability(ca, p) : 0;
822 }
823 
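/*
 * Durability contributed by a single pointer: zero for cached pointers and
 * for missing or failed devices, ec redundancy + 1 for erasure coded
 * pointers, otherwise the device's configured durability:
 */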
824 unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
825 {
826 	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
827 
828 	if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
829 		return 0;
830 
831 	return __extent_ptr_durability(ca, p);
832 }
833 
834 unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
835 {
836 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
837 	const union bch_extent_entry *entry;
838 	struct extent_ptr_decoded p;
839 	unsigned durability = 0;
840 
841 	guard(rcu)();
842 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
843 		durability += bch2_extent_ptr_durability(c, &p);
844 	return durability;
845 }
846 
847 static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
848 {
849 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
850 	const union bch_extent_entry *entry;
851 	struct extent_ptr_decoded p;
852 	unsigned durability = 0;
853 
854 	guard(rcu)();
855 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
856 		if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
857 			durability += bch2_extent_ptr_durability(c, &p);
858 	return durability;
859 }
860 
861 void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
862 {
863 	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
864 	union bch_extent_entry *next = extent_entry_next(entry);
865 
866 	memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
867 	k->k.u64s -= extent_entry_u64s(entry);
868 }
869 
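/*
 * Append a decoded pointer to @k, reusing an existing matching crc entry if
 * there is one and appending a new one otherwise; a stripe pointer is added
 * as well if @p is erasure coded:
 */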
870 void bch2_extent_ptr_decoded_append(struct bkey_i *k,
871 				    struct extent_ptr_decoded *p)
872 {
873 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
874 	struct bch_extent_crc_unpacked crc =
875 		bch2_extent_crc_unpack(&k->k, NULL);
876 	union bch_extent_entry *pos;
877 
878 	if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
879 		pos = ptrs.start;
880 		goto found;
881 	}
882 
883 	bkey_for_each_crc(&k->k, ptrs, crc, pos)
884 		if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
885 			pos = extent_entry_next(pos);
886 			goto found;
887 		}
888 
889 	bch2_extent_crc_append(k, p->crc);
890 	pos = bkey_val_end(bkey_i_to_s(k));
891 found:
892 	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
893 	__extent_entry_insert(k, pos, to_entry(&p->ptr));
894 
895 	if (p->has_ec) {
896 		p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
897 		__extent_entry_insert(k, pos, to_entry(&p->ec));
898 	}
899 }
900 
901 static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
902 					  union bch_extent_entry *entry)
903 {
904 	union bch_extent_entry *i = ptrs.start;
905 
906 	if (i == entry)
907 		return NULL;
908 
909 	while (extent_entry_next(i) != entry)
910 		i = extent_entry_next(i);
911 	return i;
912 }
913 
914 /*
915  * Drops @ptr from @k, along with any crc or stripe entries that applied only to it:
916  */
917 void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
918 {
919 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
920 	union bch_extent_entry *entry = to_entry(ptr), *next;
921 	bool drop_crc = true;
922 
923 	if (k.k->type == KEY_TYPE_stripe) {
924 		ptr->dev = BCH_SB_MEMBER_INVALID;
925 		return;
926 	}
927 
928 	EBUG_ON(ptr < &ptrs.start->ptr ||
929 		ptr >= &ptrs.end->ptr);
930 	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
931 
932 	for (next = extent_entry_next(entry);
933 	     next != ptrs.end;
934 	     next = extent_entry_next(next)) {
935 		if (extent_entry_is_crc(next)) {
936 			break;
937 		} else if (extent_entry_is_ptr(next)) {
938 			drop_crc = false;
939 			break;
940 		}
941 	}
942 
943 	extent_entry_drop(k, entry);
944 
945 	while ((entry = extent_entry_prev(ptrs, entry))) {
946 		if (extent_entry_is_ptr(entry))
947 			break;
948 
949 		if ((extent_entry_is_crc(entry) && drop_crc) ||
950 		    extent_entry_is_stripe_ptr(entry))
951 			extent_entry_drop(k, entry);
952 	}
953 }
954 
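/*
 * Like bch2_bkey_drop_ptr_noerror(), but pointers belonging to a stripe are
 * only marked invalid, and if the last dirty pointer is dropped the key
 * becomes an error key (or deleted, if no pointers remain):
 */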
955 void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
956 {
957 	if (k.k->type != KEY_TYPE_stripe) {
958 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
959 		const union bch_extent_entry *entry;
960 		struct extent_ptr_decoded p;
961 
962 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
963 			if (p.ptr.dev == ptr->dev && p.has_ec) {
964 				ptr->dev = BCH_SB_MEMBER_INVALID;
965 				return;
966 			}
967 	}
968 
969 	bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
970 
971 	bch2_bkey_drop_ptr_noerror(k, ptr);
972 
973 	/*
974 	 * If we deleted all the dirty pointers and there's still cached
975 	 * pointers, we could set the cached pointers to dirty if they're not
976 	 * stale - but to do that correctly we'd need to grab an open_bucket
977 	 * reference so that we don't race with bucket reuse:
978 	 */
979 	if (have_dirty &&
980 	    !bch2_bkey_dirty_devs(k.s_c).nr) {
981 		k.k->type = KEY_TYPE_error;
982 		set_bkey_val_u64s(k.k, 0);
983 	} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
984 		k.k->type = KEY_TYPE_deleted;
985 		set_bkey_val_u64s(k.k, 0);
986 	}
987 }
988 
989 void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
990 {
991 	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
992 }
993 
994 void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
995 {
996 	bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
997 }
998 
999 const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
1000 {
1001 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1002 
1003 	bkey_for_each_ptr(ptrs, ptr)
1004 		if (ptr->dev == dev)
1005 			return ptr;
1006 
1007 	return NULL;
1008 }
1009 
1010 bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
1011 {
1012 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1013 	struct bch_dev *ca;
1014 
1015 	guard(rcu)();
1016 	bkey_for_each_ptr(ptrs, ptr)
1017 		if (bch2_dev_in_target(c, ptr->dev, target) &&
1018 		    (ca = bch2_dev_rcu(c, ptr->dev)) &&
1019 		    (!ptr->cached ||
1020 		     !dev_ptr_stale_rcu(ca, ptr)))
1021 			return true;
1022 
1023 	return false;
1024 }
1025 
1026 bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
1027 			   struct bch_extent_ptr m, u64 offset)
1028 {
1029 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1030 	const union bch_extent_entry *entry;
1031 	struct extent_ptr_decoded p;
1032 
1033 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
1034 		if (p.ptr.dev	== m.dev &&
1035 		    p.ptr.gen	== m.gen &&
1036 		    (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
1037 		    (s64) m.offset  - offset)
1038 			return true;
1039 
1040 	return false;
1041 }
1042 
1043 /*
1044  * Returns true if two extents refer to the same data:
1045  */
1046 bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
1047 {
1048 	if (k1.k->type != k2.k->type)
1049 		return false;
1050 
1051 	if (bkey_extent_is_direct_data(k1.k)) {
1052 		struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
1053 		struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
1054 		const union bch_extent_entry *entry1, *entry2;
1055 		struct extent_ptr_decoded p1, p2;
1056 
1057 		if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
1058 			return false;
1059 
1060 		bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
1061 			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
1062 				if (p1.ptr.dev		== p2.ptr.dev &&
1063 				    p1.ptr.gen		== p2.ptr.gen &&
1064 
1065 				    /*
1066 				     * This checks that the two pointers point
1067 				     * to the same region on disk - adjusting
1068 				     * for the difference in where the extents
1069 				     * start, since one may have been trimmed:
1070 				     */
1071 				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
1072 				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
1073 
1074 				    /*
1075 				     * This additionally checks that the
1076 				     * extents overlap on disk, since the
1077 				     * previous check may trigger spuriously
1078 				     * when one extent is immediately partially
1079 				     * overwritten with another extent (so that
1080 				     * on disk they are adjacent) and
1081 				     * compression is in use:
1082 				     */
1083 				    ((p1.ptr.offset >= p2.ptr.offset &&
1084 				      p1.ptr.offset  < p2.ptr.offset + p2.crc.compressed_size) ||
1085 				     (p2.ptr.offset >= p1.ptr.offset &&
1086 				      p2.ptr.offset  < p1.ptr.offset + p1.crc.compressed_size)))
1087 					return true;
1088 
1089 		return false;
1090 	} else {
1091 		/* KEY_TYPE_deleted, etc. */
1092 		return true;
1093 	}
1094 }
1095 
1096 struct bch_extent_ptr *
1097 bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
1098 {
1099 	struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
1100 	union bch_extent_entry *entry2;
1101 	struct extent_ptr_decoded p2;
1102 
1103 	bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
1104 		if (p1.ptr.dev		== p2.ptr.dev &&
1105 		    p1.ptr.gen		== p2.ptr.gen &&
1106 		    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
1107 		    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
1108 			return &entry2->ptr;
1109 
1110 	return NULL;
1111 }
1112 
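/*
 * Should a cached copy be kept on @ptr's device? Only if it's in the promote
 * (or foreground) target, when one is set, and the device is healthy and the
 * pointer isn't stale:
 */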
1113 static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
1114 			    struct bch_extent_ptr *ptr)
1115 {
1116 	unsigned target = opts->promote_target ?: opts->foreground_target;
1117 
1118 	if (target && !bch2_dev_in_target(c, ptr->dev, target))
1119 		return false;
1120 
1121 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
1122 
1123 	return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
1124 }
1125 
1126 void bch2_extent_ptr_set_cached(struct bch_fs *c,
1127 				struct bch_io_opts *opts,
1128 				struct bkey_s k,
1129 				struct bch_extent_ptr *ptr)
1130 {
1131 	struct bkey_ptrs ptrs;
1132 	union bch_extent_entry *entry;
1133 	struct extent_ptr_decoded p;
1134 	bool have_cached_ptr;
1135 	unsigned drop_dev = ptr->dev;
1136 
1137 	guard(rcu)();
1138 restart_drop_ptrs:
1139 	ptrs = bch2_bkey_ptrs(k);
1140 	have_cached_ptr = false;
1141 
1142 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
1143 		/*
1144 		 * Check if it's erasure coded - stripes can't contain cached
1145 		 * data. Possibly something we can fix in the future?
1146 		 */
1147 		if (&entry->ptr == ptr && p.has_ec)
1148 			goto drop;
1149 
1150 		if (p.ptr.cached) {
1151 			if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) {
1152 				bch2_bkey_drop_ptr_noerror(k, &entry->ptr);
1153 				ptr = NULL;
1154 				goto restart_drop_ptrs;
1155 			}
1156 
1157 			have_cached_ptr = true;
1158 		}
1159 	}
1160 
1161 	if (!ptr)
1162 		bkey_for_each_ptr(ptrs, ptr2)
1163 			if (ptr2->dev == drop_dev)
1164 				ptr = ptr2;
1165 
1166 	if (have_cached_ptr || !want_cached_ptr(c, opts, ptr))
1167 		goto drop;
1168 
1169 	ptr->cached = true;
1170 	return;
1171 drop:
1172 	bch2_bkey_drop_ptr_noerror(k, ptr);
1173 }
1174 
1175 /*
1176  * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
1177  *
1178  * Returns true if @k should be dropped entirely
1179  *
1180  * For existing keys, only called when btree nodes are being rewritten, not when
1181  * they're merely being compacted/resorted in memory.
1182  */
1183 bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
1184 {
1185 	struct bch_dev *ca;
1186 
1187 	guard(rcu)();
1188 	bch2_bkey_drop_ptrs(k, ptr,
1189 		ptr->cached &&
1190 		(!(ca = bch2_dev_rcu(c, ptr->dev)) ||
1191 		 dev_ptr_stale_rcu(ca, ptr) > 0));
1192 
1193 	return bkey_deleted(k.k);
1194 }
1195 
1196 /*
1197  * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
1198  *
1199  * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
1200  * the promote target.
1201  */
1202 bool bch2_extent_normalize_by_opts(struct bch_fs *c,
1203 				   struct bch_io_opts *opts,
1204 				   struct bkey_s k)
1205 {
1206 	struct bkey_ptrs ptrs;
1207 	bool have_cached_ptr;
1208 
1209 	guard(rcu)();
1210 restart_drop_ptrs:
1211 	ptrs = bch2_bkey_ptrs(k);
1212 	have_cached_ptr = false;
1213 
1214 	bkey_for_each_ptr(ptrs, ptr)
1215 		if (ptr->cached) {
1216 			if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
1217 				bch2_bkey_drop_ptr(k, ptr);
1218 				goto restart_drop_ptrs;
1219 			}
1220 			have_cached_ptr = true;
1221 		}
1222 
1223 	return bkey_deleted(k.k);
1224 }
1225 
1226 void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
1227 {
1228 	out->atomic++;
1229 	guard(rcu)();
1230 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
1231 	if (!ca) {
1232 		prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
1233 			   (u64) ptr->offset, ptr->gen,
1234 			   ptr->cached ? " cached" : "");
1235 	} else {
1236 		u32 offset;
1237 		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
1238 
1239 		prt_printf(out, "ptr: %u:%llu:%u gen %u",
1240 			   ptr->dev, b, offset, ptr->gen);
1241 		if (ca->mi.durability != 1)
1242 			prt_printf(out, " d=%u", ca->mi.durability);
1243 		if (ptr->cached)
1244 			prt_str(out, " cached");
1245 		if (ptr->unwritten)
1246 			prt_str(out, " unwritten");
1247 		int stale = dev_ptr_stale_rcu(ca, ptr);
1248 		if (stale > 0)
1249 			prt_printf(out, " stale");
1250 		else if (stale)
1251 			prt_printf(out, " invalid");
1252 	}
1253 	--out->atomic;
1254 }
1255 
1256 void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
1257 {
1258 	prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
1259 		   crc->compressed_size,
1260 		   crc->uncompressed_size,
1261 		   crc->offset, crc->nonce);
1262 	bch2_prt_csum_type(out, crc->csum_type);
1263 	prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
1264 	prt_str(out, " compress ");
1265 	bch2_prt_compression_type(out, crc->compression_type);
1266 }
1267 
1268 static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
1269 					  const struct bch_extent_rebalance *r)
1270 {
1271 	prt_str(out, "rebalance:");
1272 
1273 	prt_printf(out, " replicas=%u", r->data_replicas);
1274 	if (r->data_replicas_from_inode)
1275 		prt_str(out, " (inode)");
1276 
1277 	prt_str(out, " checksum=");
1278 	bch2_prt_csum_opt(out, r->data_checksum);
1279 	if (r->data_checksum_from_inode)
1280 		prt_str(out, " (inode)");
1281 
1282 	if (r->background_compression || r->background_compression_from_inode) {
1283 		prt_str(out, " background_compression=");
1284 		bch2_compression_opt_to_text(out, r->background_compression);
1285 
1286 		if (r->background_compression_from_inode)
1287 			prt_str(out, " (inode)");
1288 	}
1289 
1290 	if (r->background_target || r->background_target_from_inode) {
1291 		prt_str(out, " background_target=");
1292 		if (c)
1293 			bch2_target_to_text(out, c, r->background_target);
1294 		else
1295 			prt_printf(out, "%u", r->background_target);
1296 
1297 		if (r->background_target_from_inode)
1298 			prt_str(out, " (inode)");
1299 	}
1300 
1301 	if (r->promote_target || r->promote_target_from_inode) {
1302 		prt_str(out, " promote_target=");
1303 		if (c)
1304 			bch2_target_to_text(out, c, r->promote_target);
1305 		else
1306 			prt_printf(out, "%u", r->promote_target);
1307 
1308 		if (r->promote_target_from_inode)
1309 			prt_str(out, " (inode)");
1310 	}
1311 
1312 	if (r->erasure_code || r->erasure_code_from_inode) {
1313 		prt_printf(out, " ec=%u", r->erasure_code);
1314 		if (r->erasure_code_from_inode)
1315 			prt_str(out, " (inode)");
1316 	}
1317 }
1318 
1319 void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
1320 			    struct bkey_s_c k)
1321 {
1322 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1323 	const union bch_extent_entry *entry;
1324 	bool first = true;
1325 
1326 	if (c)
1327 		prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
1328 
1329 	bkey_extent_entry_for_each(ptrs, entry) {
1330 		if (!first)
1331 			prt_printf(out, " ");
1332 
1333 		switch (__extent_entry_type(entry)) {
1334 		case BCH_EXTENT_ENTRY_ptr:
1335 			bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
1336 			break;
1337 
1338 		case BCH_EXTENT_ENTRY_crc32:
1339 		case BCH_EXTENT_ENTRY_crc64:
1340 		case BCH_EXTENT_ENTRY_crc128: {
1341 			struct bch_extent_crc_unpacked crc =
1342 				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
1343 
1344 			bch2_extent_crc_unpacked_to_text(out, &crc);
1345 			break;
1346 		}
1347 		case BCH_EXTENT_ENTRY_stripe_ptr: {
1348 			const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
1349 
1350 			prt_printf(out, "ec: idx %llu block %u",
1351 			       (u64) ec->idx, ec->block);
1352 			break;
1353 		}
1354 		case BCH_EXTENT_ENTRY_rebalance:
1355 			bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
1356 			break;
1357 
1358 		case BCH_EXTENT_ENTRY_flags:
1359 			prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
1360 			break;
1361 
1362 		default:
1363 			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
1364 			return;
1365 		}
1366 
1367 		first = false;
1368 	}
1369 }
1370 
1371 static int extent_ptr_validate(struct bch_fs *c,
1372 			       struct bkey_s_c k,
1373 			       struct bkey_validate_context from,
1374 			       const struct bch_extent_ptr *ptr,
1375 			       unsigned size_ondisk,
1376 			       bool metadata)
1377 {
1378 	int ret = 0;
1379 
1380 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1381 	bkey_for_each_ptr(ptrs, ptr2)
1382 		bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
1383 				 c, ptr_to_duplicate_device,
1384 				 "multiple pointers to same device (%u)", ptr->dev);
1385 
1386 	/* bad pointers are repaired by check_fix_ptrs(): */
1387 	rcu_read_lock();
1388 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
1389 	if (!ca) {
1390 		rcu_read_unlock();
1391 		return 0;
1392 	}
1393 	u32 bucket_offset;
1394 	u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
1395 	unsigned first_bucket	= ca->mi.first_bucket;
1396 	u64 nbuckets		= ca->mi.nbuckets;
1397 	unsigned bucket_size	= ca->mi.bucket_size;
1398 	rcu_read_unlock();
1399 
1400 	bkey_fsck_err_on(bucket >= nbuckets,
1401 			 c, ptr_after_last_bucket,
1402 			 "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
1403 	bkey_fsck_err_on(bucket < first_bucket,
1404 			 c, ptr_before_first_bucket,
1405 			 "pointer before first bucket (%llu < %u)", bucket, first_bucket);
1406 	bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
1407 			 c, ptr_spans_multiple_buckets,
1408 			 "pointer spans multiple buckets (%u + %u > %u)",
1409 		       bucket_offset, size_ondisk, bucket_size);
1410 fsck_err:
1411 	return ret;
1412 }
1413 
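/*
 * Validate the entries of an extent or btree pointer key: entry types,
 * per-pointer and per-crc fields, and whole-key invariants (at least one
 * pointer, no redundant crc or stripe entries, etc.):
 */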
1414 int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
1415 			    struct bkey_validate_context from)
1416 {
1417 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1418 	const union bch_extent_entry *entry;
1419 	struct bch_extent_crc_unpacked crc;
1420 	unsigned size_ondisk = k.k->size;
1421 	unsigned nonce = UINT_MAX;
1422 	unsigned nr_ptrs = 0;
1423 	bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
1424 	int ret = 0;
1425 
1426 	if (bkey_is_btree_ptr(k.k))
1427 		size_ondisk = btree_sectors(c);
1428 
1429 	bkey_extent_entry_for_each(ptrs, entry) {
1430 		bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
1431 				 c, extent_ptrs_invalid_entry,
1432 				 "invalid extent entry type (got %u, max %u)",
1433 				 __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
1434 
1435 		bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
1436 				 !extent_entry_is_ptr(entry),
1437 				 c, btree_ptr_has_non_ptr,
1438 				 "has non ptr field");
1439 
1440 		switch (extent_entry_type(entry)) {
1441 		case BCH_EXTENT_ENTRY_ptr:
1442 			ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
1443 			if (ret)
1444 				return ret;
1445 
1446 			bkey_fsck_err_on(entry->ptr.cached && have_ec,
1447 					 c, ptr_cached_and_erasure_coded,
1448 					 "cached, erasure coded ptr");
1449 
1450 			if (!entry->ptr.unwritten)
1451 				have_written = true;
1452 			else
1453 				have_unwritten = true;
1454 
1455 			have_ec = false;
1456 			crc_since_last_ptr = false;
1457 			nr_ptrs++;
1458 			break;
1459 		case BCH_EXTENT_ENTRY_crc32:
1460 		case BCH_EXTENT_ENTRY_crc64:
1461 		case BCH_EXTENT_ENTRY_crc128:
1462 			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
1463 
1464 			bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
1465 					 c, ptr_crc_csum_type_unknown,
1466 					 "invalid checksum type");
1467 			bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
1468 					 c, ptr_crc_compression_type_unknown,
1469 					 "invalid compression type");
1470 
1471 			bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
1472 					 c, ptr_crc_uncompressed_size_too_small,
1473 					 "checksum offset + key size > uncompressed size");
1474 			bkey_fsck_err_on(crc_is_encoded(crc) &&
1475 					 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
1476 					 (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
1477 					 c, ptr_crc_uncompressed_size_too_big,
1478 					 "too large encoded extent");
1479 			bkey_fsck_err_on(!crc_is_compressed(crc) &&
1480 					 crc.compressed_size != crc.uncompressed_size,
1481 					 c, ptr_crc_uncompressed_size_mismatch,
1482 					 "not compressed but compressed != uncompressed size");
1483 
1484 			if (bch2_csum_type_is_encryption(crc.csum_type)) {
1485 				if (nonce == UINT_MAX)
1486 					nonce = crc.offset + crc.nonce;
1487 				else if (nonce != crc.offset + crc.nonce)
1488 					bkey_fsck_err(c, ptr_crc_nonce_mismatch,
1489 						      "incorrect nonce");
1490 			}
1491 
1492 			bkey_fsck_err_on(crc_since_last_ptr,
1493 					 c, ptr_crc_redundant,
1494 					 "redundant crc entry");
1495 			crc_since_last_ptr = true;
1496 
1497 			size_ondisk = crc.compressed_size;
1498 			break;
1499 		case BCH_EXTENT_ENTRY_stripe_ptr:
1500 			bkey_fsck_err_on(have_ec,
1501 					 c, ptr_stripe_redundant,
1502 					 "redundant stripe entry");
1503 			have_ec = true;
1504 			break;
1505 		case BCH_EXTENT_ENTRY_rebalance: {
1506 			/*
1507 			 * this shouldn't be a fsck error, for forward
1508 			 * compatibility; the rebalance code should just refetch
1509 			 * the compression opt if it's unknown
1510 			 */
1511 #if 0
1512 			const struct bch_extent_rebalance *r = &entry->rebalance;
1513 
1514 			if (!bch2_compression_opt_valid(r->compression)) {
1515 				struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
1516 				prt_printf(err, "invalid compression opt %u:%u",
1517 					   opt.type, opt.level);
1518 				return bch_err_throw(c, invalid_bkey);
1519 			}
1520 #endif
1521 			break;
1522 		}
1523 		case BCH_EXTENT_ENTRY_flags:
1524 			bkey_fsck_err_on(entry != ptrs.start,
1525 					 c, extent_flags_not_at_start,
1526 					 "extent flags entry not at start");
1527 			break;
1528 		}
1529 	}
1530 
1531 	bkey_fsck_err_on(!nr_ptrs,
1532 			 c, extent_ptrs_no_ptrs,
1533 			 "no ptrs");
1534 	bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
1535 			 c, extent_ptrs_too_many_ptrs,
1536 			 "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
1537 	bkey_fsck_err_on(have_written && have_unwritten,
1538 			 c, extent_ptrs_written_and_unwritten,
1539 			 "extent with unwritten and written ptrs");
1540 	bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
1541 			 c, extent_ptrs_unwritten,
1542 			 "has unwritten ptrs");
1543 	bkey_fsck_err_on(crc_since_last_ptr,
1544 			 c, extent_ptrs_redundant_crc,
1545 			 "redundant crc entry");
1546 	bkey_fsck_err_on(have_ec,
1547 			 c, extent_ptrs_redundant_stripe,
1548 			 "redundant stripe entry");
1549 fsck_err:
1550 	return ret;
1551 }
1552 
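/*
 * Byteswap an extent value: every 64 bit word is swapped, then the checksum
 * fields, which aren't laid out as aligned 64 bit words, are fixed up:
 */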
1553 void bch2_ptr_swab(struct bkey_s k)
1554 {
1555 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1556 	union bch_extent_entry *entry;
1557 	u64 *d;
1558 
1559 	for (d =  (u64 *) ptrs.start;
1560 	     d != (u64 *) ptrs.end;
1561 	     d++)
1562 		*d = swab64(*d);
1563 
1564 	for (entry = ptrs.start;
1565 	     entry < ptrs.end;
1566 	     entry = extent_entry_next(entry)) {
1567 		switch (__extent_entry_type(entry)) {
1568 		case BCH_EXTENT_ENTRY_ptr:
1569 			break;
1570 		case BCH_EXTENT_ENTRY_crc32:
1571 			entry->crc32.csum = swab32(entry->crc32.csum);
1572 			break;
1573 		case BCH_EXTENT_ENTRY_crc64:
1574 			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
1575 			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
1576 			break;
1577 		case BCH_EXTENT_ENTRY_crc128:
1578 			entry->crc128.csum.hi = (__force __le64)
1579 				swab64((__force u64) entry->crc128.csum.hi);
1580 			entry->crc128.csum.lo = (__force __le64)
1581 				swab64((__force u64) entry->crc128.csum.lo);
1582 			break;
1583 		case BCH_EXTENT_ENTRY_stripe_ptr:
1584 			break;
1585 		case BCH_EXTENT_ENTRY_rebalance:
1586 			break;
1587 		default:
1588 			/* Bad entry type: will be caught by validate() */
1589 			return;
1590 		}
1591 	}
1592 }
1593 
1594 int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
1595 {
1596 	int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
1597 	if (ret)
1598 		return ret;
1599 
1600 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
1601 
1602 	if (ptrs.start != ptrs.end &&
1603 	    extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
1604 		ptrs.start->flags.flags = flags;
1605 	} else {
1606 		struct bch_extent_flags f = {
1607 			.type	= BIT(BCH_EXTENT_ENTRY_flags),
1608 			.flags	= flags,
1609 		};
1610 		__extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
1611 	}
1612 
1613 	return 0;
1614 }
1615 
1616 /* Generic extent code: */
1617 
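/*
 * Trim the front of @k up to @where: pointer and crc offsets are adjusted so
 * the remaining extent still references the same data. Returns the change in
 * value size, in u64s (zero or negative):
 */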
1618 int bch2_cut_front_s(struct bpos where, struct bkey_s k)
1619 {
1620 	unsigned new_val_u64s = bkey_val_u64s(k.k);
1621 	int val_u64s_delta;
1622 	u64 sub;
1623 
1624 	if (bkey_le(where, bkey_start_pos(k.k)))
1625 		return 0;
1626 
1627 	EBUG_ON(bkey_gt(where, k.k->p));
1628 
1629 	sub = where.offset - bkey_start_offset(k.k);
1630 
1631 	k.k->size -= sub;
1632 
1633 	if (!k.k->size) {
1634 		k.k->type = KEY_TYPE_deleted;
1635 		new_val_u64s = 0;
1636 	}
1637 
1638 	switch (k.k->type) {
1639 	case KEY_TYPE_extent:
1640 	case KEY_TYPE_reflink_v: {
1641 		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1642 		union bch_extent_entry *entry;
1643 		bool seen_crc = false;
1644 
1645 		bkey_extent_entry_for_each(ptrs, entry) {
1646 			switch (extent_entry_type(entry)) {
1647 			case BCH_EXTENT_ENTRY_ptr:
1648 				if (!seen_crc)
1649 					entry->ptr.offset += sub;
1650 				break;
1651 			case BCH_EXTENT_ENTRY_crc32:
1652 				entry->crc32.offset += sub;
1653 				break;
1654 			case BCH_EXTENT_ENTRY_crc64:
1655 				entry->crc64.offset += sub;
1656 				break;
1657 			case BCH_EXTENT_ENTRY_crc128:
1658 				entry->crc128.offset += sub;
1659 				break;
1660 			case BCH_EXTENT_ENTRY_stripe_ptr:
1661 			case BCH_EXTENT_ENTRY_rebalance:
1662 			case BCH_EXTENT_ENTRY_flags:
1663 				break;
1664 			}
1665 
1666 			if (extent_entry_is_crc(entry))
1667 				seen_crc = true;
1668 		}
1669 
1670 		break;
1671 	}
1672 	case KEY_TYPE_reflink_p: {
1673 		struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
1674 
1675 		SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
1676 		break;
1677 	}
1678 	case KEY_TYPE_inline_data:
1679 	case KEY_TYPE_indirect_inline_data: {
1680 		void *p = bkey_inline_data_p(k);
1681 		unsigned bytes = bkey_inline_data_bytes(k.k);
1682 
1683 		sub = min_t(u64, sub << 9, bytes);
1684 
1685 		memmove(p, p + sub, bytes - sub);
1686 
1687 		new_val_u64s -= sub >> 3;
1688 		break;
1689 	}
1690 	}
1691 
1692 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
1693 	BUG_ON(val_u64s_delta < 0);
1694 
1695 	set_bkey_val_u64s(k.k, new_val_u64s);
1696 	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
1697 	return -val_u64s_delta;
1698 }
1699 
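/*
 * Trim the back of @k so it ends at @where; inline data is truncated to
 * match. Returns the change in value size, in u64s (zero or negative):
 */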
1700 int bch2_cut_back_s(struct bpos where, struct bkey_s k)
1701 {
1702 	unsigned new_val_u64s = bkey_val_u64s(k.k);
1703 	int val_u64s_delta;
1704 	u64 len = 0;
1705 
1706 	if (bkey_ge(where, k.k->p))
1707 		return 0;
1708 
1709 	EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
1710 
1711 	len = where.offset - bkey_start_offset(k.k);
1712 
1713 	k.k->p.offset = where.offset;
1714 	k.k->size = len;
1715 
1716 	if (!len) {
1717 		k.k->type = KEY_TYPE_deleted;
1718 		new_val_u64s = 0;
1719 	}
1720 
1721 	switch (k.k->type) {
1722 	case KEY_TYPE_inline_data:
1723 	case KEY_TYPE_indirect_inline_data:
1724 		new_val_u64s = (bkey_inline_data_offset(k.k) +
1725 				min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
1726 		break;
1727 	}
1728 
1729 	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
1730 	BUG_ON(val_u64s_delta < 0);
1731 
1732 	set_bkey_val_u64s(k.k, new_val_u64s);
1733 	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
1734 	return -val_u64s_delta;
1735 }
1736