1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4  * Copyright (C) 2014 Datera Inc.
5  */
6 
7 #include "bcachefs.h"
8 #include "alloc_background.h"
9 #include "alloc_foreground.h"
10 #include "backpointers.h"
11 #include "bkey_methods.h"
12 #include "bkey_buf.h"
13 #include "btree_journal_iter.h"
14 #include "btree_key_cache.h"
15 #include "btree_locking.h"
16 #include "btree_node_scan.h"
17 #include "btree_update_interior.h"
18 #include "btree_io.h"
19 #include "btree_gc.h"
20 #include "buckets.h"
21 #include "clock.h"
22 #include "debug.h"
23 #include "disk_accounting.h"
24 #include "ec.h"
25 #include "enumerated_ref.h"
26 #include "error.h"
27 #include "extents.h"
28 #include "journal.h"
29 #include "keylist.h"
30 #include "move.h"
31 #include "progress.h"
32 #include "recovery_passes.h"
33 #include "reflink.h"
34 #include "recovery.h"
35 #include "replicas.h"
36 #include "super-io.h"
37 #include "trace.h"
38 
39 #include <linux/slab.h>
40 #include <linux/bitops.h>
41 #include <linux/freezer.h>
42 #include <linux/kthread.h>
43 #include <linux/preempt.h>
44 #include <linux/rcupdate.h>
45 #include <linux/sched/task.h>
46 
47 #define DROP_THIS_NODE		10
48 #define DROP_PREV_NODE		11
49 #define DID_FILL_FROM_SCAN	12
50 
51 /*
52  * Returns false if it's a btree we can easily reconstruct, or that otherwise
53  * won't cause data loss if it's missing:
54  */
55 static bool btree_id_important(enum btree_id btree)
56 {
57 	if (btree_id_is_alloc(btree))
58 		return false;
59 
60 	switch (btree) {
61 	case BTREE_ID_quotas:
62 	case BTREE_ID_snapshot_trees:
63 	case BTREE_ID_logged_ops:
64 	case BTREE_ID_rebalance_work:
65 	case BTREE_ID_subvolume_children:
66 		return false;
67 	default:
68 		return true;
69 	}
70 }
71 
72 static const char * const bch2_gc_phase_strs[] = {
73 #define x(n)	#n,
74 	GC_PHASES()
75 #undef x
76 	NULL
77 };
78 
79 void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
80 {
81 	prt_str(out, bch2_gc_phase_strs[p->phase]);
82 	prt_char(out, ' ');
83 	bch2_btree_id_level_to_text(out, p->btree, p->level);
84 	prt_char(out, ' ');
85 	bch2_bpos_to_text(out, p->pos);
86 }
87 
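/*
 * Cast away constness so a read-only key can be handed to trigger code
 * (bch2_key_trigger() takes a struct bkey_s); the callers here do not expect
 * the key to be modified in place, hence "unsafe".
 */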
88 static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
89 {
90 	return (struct bkey_s) {{{
91 		(struct bkey *) k.k,
92 		(struct bch_val *) k.v
93 	}}};
94 }
95 
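/*
 * c->gc_pos tracks how far the current gc pass has progressed; it is updated
 * under gc_pos_lock (a seqcount) so readers can take a consistent snapshot
 * without blocking the writer. gc_pos_set() additionally asserts that the
 * position only ever moves forward, which is what lets other code reliably
 * answer "has gc visited this position yet?".
 */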
96 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
97 {
98 	preempt_disable();
99 	write_seqcount_begin(&c->gc_pos_lock);
100 	c->gc_pos = new_pos;
101 	write_seqcount_end(&c->gc_pos_lock);
102 	preempt_enable();
103 }
104 
105 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
106 {
107 	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
108 	__gc_pos_set(c, new_pos);
109 }
110 
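/*
 * Build a btree_ptr_v2 version of a node's key: v2 keys carry min_key and the
 * RANGE_UPDATED flag that the topology repair helpers below rely on, so older
 * KEY_TYPE_btree_ptr keys are converted here, filling in seq/min_key from the
 * node's header.
 */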
111 static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
112 {
113 	switch (b->key.k.type) {
114 	case KEY_TYPE_btree_ptr: {
115 		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
116 
117 		dst->k.p		= src->k.p;
118 		dst->v.mem_ptr		= 0;
119 		dst->v.seq		= b->data->keys.seq;
120 		dst->v.sectors_written	= 0;
121 		dst->v.flags		= 0;
122 		dst->v.min_key		= b->data->min_key;
123 		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
124 		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
125 		break;
126 	}
127 	case KEY_TYPE_btree_ptr_v2:
128 		bkey_copy(&dst->k_i, &b->key);
129 		break;
130 	default:
131 		BUG();
132 	}
133 }
134 
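/*
 * Topology repair helpers: rewrite a node's min_key or max_key, both in the
 * in-memory node and in the parent's pointer to it (inserted as a journal key
 * at level + 1), then drop any keys that now fall outside the node. Changing
 * max_key changes the key's position, so set_node_max() also rehashes the
 * node in the btree node cache.
 */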
135 static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
136 {
137 	struct bkey_i_btree_ptr_v2 *new;
138 	int ret;
139 
140 	if (c->opts.verbose) {
141 		struct printbuf buf = PRINTBUF;
142 
143 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
144 		prt_str(&buf, " -> ");
145 		bch2_bpos_to_text(&buf, new_min);
146 
147 		bch_info(c, "%s(): %s", __func__, buf.buf);
148 		printbuf_exit(&buf);
149 	}
150 
151 	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
152 	if (!new)
153 		return bch_err_throw(c, ENOMEM_gc_repair_key);
154 
155 	btree_ptr_to_v2(b, new);
156 	b->data->min_key	= new_min;
157 	new->v.min_key		= new_min;
158 	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
159 
160 	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
161 	if (ret) {
162 		kfree(new);
163 		return ret;
164 	}
165 
166 	bch2_btree_node_drop_keys_outside_node(b);
167 	bkey_copy(&b->key, &new->k_i);
168 	return 0;
169 }
170 
171 static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
172 {
173 	struct bkey_i_btree_ptr_v2 *new;
174 	int ret;
175 
176 	if (c->opts.verbose) {
177 		struct printbuf buf = PRINTBUF;
178 
179 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
180 		prt_str(&buf, " -> ");
181 		bch2_bpos_to_text(&buf, new_max);
182 
183 		bch_info(c, "%s(): %s", __func__, buf.buf);
184 		printbuf_exit(&buf);
185 	}
186 
187 	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
188 	if (ret)
189 		return ret;
190 
191 	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
192 	if (!new)
193 		return bch_err_throw(c, ENOMEM_gc_repair_key);
194 
195 	btree_ptr_to_v2(b, new);
196 	b->data->max_key	= new_max;
197 	new->k.p		= new_max;
198 	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
199 
200 	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
201 	if (ret) {
202 		kfree(new);
203 		return ret;
204 	}
205 
206 	bch2_btree_node_drop_keys_outside_node(b);
207 
208 	mutex_lock(&c->btree_cache.lock);
209 	__bch2_btree_node_hash_remove(&c->btree_cache, b);
210 
211 	bkey_copy(&b->key, &new->k_i);
212 	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
213 	BUG_ON(ret);
214 	mutex_unlock(&c->btree_cache.lock);
215 	return 0;
216 }
217 
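/*
 * Check that @cur starts exactly where @prev left off (or at the parent's
 * min_key if @cur is the first child). Gaps may be filled from nodes found by
 * btree node scan; overlaps are resolved by adjusting min/max keys or by
 * telling the caller to drop one of the nodes via DROP_THIS_NODE /
 * DROP_PREV_NODE. DID_FILL_FROM_SCAN signals that the caller should restart
 * its walk of this node.
 */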
218 static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
219 				       struct btree *prev, struct btree *cur,
220 				       struct bpos *pulled_from_scan)
221 {
222 	struct bch_fs *c = trans->c;
223 	struct bpos expected_start = !prev
224 		? b->data->min_key
225 		: bpos_successor(prev->key.k.p);
226 	struct printbuf buf = PRINTBUF;
227 	int ret = 0;
228 
229 	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
230 	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
231 			b->data->min_key));
232 
233 	if (bpos_eq(expected_start, cur->data->min_key))
234 		return 0;
235 
236 	prt_printf(&buf, "  at ");
237 	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
238 	prt_printf(&buf, ":\nparent: ");
239 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
240 
241 	if (prev) {
242 		prt_printf(&buf, "\nprev: ");
243 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
244 	}
245 
246 	prt_str(&buf, "\nnext: ");
247 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
248 
249 	if (bpos_lt(expected_start, cur->data->min_key)) {				/* gap */
250 		if (b->c.level == 1 &&
251 		    bpos_lt(*pulled_from_scan, cur->data->min_key)) {
252 			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
253 						     expected_start,
254 						     bpos_predecessor(cur->data->min_key));
255 			if (ret)
256 				goto err;
257 
258 			*pulled_from_scan = cur->data->min_key;
259 			ret = DID_FILL_FROM_SCAN;
260 		} else {
261 			if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
262 					     "btree node with incorrect min_key%s", buf.buf))
263 				ret = set_node_min(c, cur, expected_start);
264 		}
265 	} else {									/* overlap */
266 		if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {	/* cur overwrites prev */
267 			if (bpos_ge(prev->data->min_key, cur->data->min_key)) {		/* fully? */
268 				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
269 						     "btree node overwritten by next node%s", buf.buf))
270 					ret = DROP_PREV_NODE;
271 			} else {
272 				if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
273 						     "btree node with incorrect max_key%s", buf.buf))
274 					ret = set_node_max(c, prev,
275 							   bpos_predecessor(cur->data->min_key));
276 			}
277 		} else {
278 			if (bpos_ge(expected_start, cur->data->max_key)) {		/* fully? */
279 				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
280 						     "btree node overwritten by prev node%s", buf.buf))
281 					ret = DROP_THIS_NODE;
282 			} else {
283 				if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
284 						     "btree node with incorrect min_key%s", buf.buf))
285 					ret = set_node_min(c, cur, expected_start);
286 			}
287 		}
288 	}
289 err:
290 fsck_err:
291 	printbuf_exit(&buf);
292 	return ret;
293 }
294 
295 static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
296 				 struct btree *child, struct bpos *pulled_from_scan)
297 {
298 	struct bch_fs *c = trans->c;
299 	struct printbuf buf = PRINTBUF;
300 	int ret = 0;
301 
302 	if (bpos_eq(child->key.k.p, b->key.k.p))
303 		return 0;
304 
305 	prt_printf(&buf, "\nat: ");
306 	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
307 	prt_printf(&buf, "\nparent: ");
308 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
309 
310 	prt_str(&buf, "\nchild: ");
311 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
312 
313 	if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
314 			     "btree node with incorrect max_key%s", buf.buf)) {
315 		if (b->c.level == 1 &&
316 		    bpos_lt(*pulled_from_scan, b->key.k.p)) {
317 			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
318 						bpos_successor(child->key.k.p), b->key.k.p);
319 			if (ret)
320 				goto err;
321 
322 			*pulled_from_scan = b->key.k.p;
323 			ret = DID_FILL_FROM_SCAN;
324 		} else {
325 			ret = set_node_max(c, child, b->key.k.p);
326 		}
327 	}
328 err:
329 fsck_err:
330 	printbuf_exit(&buf);
331 	return ret;
332 }
333 
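/*
 * Walk the children of an interior node: evict unreadable or stale child
 * nodes, repair child boundaries, then recurse. If the node ends up with no
 * usable children, DROP_THIS_NODE is returned so the parent (or
 * bch2_check_topology()) can remove it.
 */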
334 static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
335 					      struct bpos *pulled_from_scan)
336 {
337 	struct bch_fs *c = trans->c;
338 	struct btree_and_journal_iter iter;
339 	struct bkey_s_c k;
340 	struct bkey_buf prev_k, cur_k;
341 	struct btree *prev = NULL, *cur = NULL;
342 	bool have_child, new_pass = false;
343 	struct printbuf buf = PRINTBUF;
344 	int ret = 0;
345 
346 	if (!b->c.level)
347 		return 0;
348 
349 	bch2_bkey_buf_init(&prev_k);
350 	bch2_bkey_buf_init(&cur_k);
351 again:
352 	cur = prev = NULL;
353 	have_child = new_pass = false;
354 	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
355 	iter.prefetch = true;
356 
357 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
358 		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
359 		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
360 
361 		bch2_btree_and_journal_iter_advance(&iter);
362 		bch2_bkey_buf_reassemble(&cur_k, c, k);
363 
364 		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
365 					b->c.btree_id, b->c.level - 1,
366 					false);
367 		ret = PTR_ERR_OR_ZERO(cur);
368 
369 		printbuf_reset(&buf);
370 		bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
371 		prt_char(&buf, ' ');
372 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
373 
374 		if (bch2_err_matches(ret, EIO)) {
375 			bch2_btree_node_evict(trans, cur_k.k);
376 			cur = NULL;
377 			ret = bch2_journal_key_delete(c, b->c.btree_id,
378 						      b->c.level, cur_k.k->k.p);
379 			if (ret)
380 				break;
381 			continue;
382 		}
383 
384 		bch_err_msg(c, ret, "getting btree node");
385 		if (ret)
386 			break;
387 
388 		if (bch2_btree_node_is_stale(c, cur)) {
389 			bch_info(c, "btree node older than nodes found by scanning\n  %s", buf.buf);
390 			six_unlock_read(&cur->c.lock);
391 			bch2_btree_node_evict(trans, cur_k.k);
392 			ret = bch2_journal_key_delete(c, b->c.btree_id,
393 						      b->c.level, cur_k.k->k.p);
394 			cur = NULL;
395 			if (ret)
396 				break;
397 			continue;
398 		}
399 
400 		ret = lockrestart_do(trans,
401 			btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan));
402 		if (ret < 0)
403 			goto err;
404 
405 		if (ret == DID_FILL_FROM_SCAN) {
406 			new_pass = true;
407 			ret = 0;
408 		}
409 
410 		if (ret == DROP_THIS_NODE) {
411 			six_unlock_read(&cur->c.lock);
412 			bch2_btree_node_evict(trans, cur_k.k);
413 			ret = bch2_journal_key_delete(c, b->c.btree_id,
414 						      b->c.level, cur_k.k->k.p);
415 			cur = NULL;
416 			if (ret)
417 				break;
418 			continue;
419 		}
420 
421 		if (prev)
422 			six_unlock_read(&prev->c.lock);
423 		prev = NULL;
424 
425 		if (ret == DROP_PREV_NODE) {
426 			bch_info(c, "dropped prev node");
427 			bch2_btree_node_evict(trans, prev_k.k);
428 			ret = bch2_journal_key_delete(c, b->c.btree_id,
429 						      b->c.level, prev_k.k->k.p);
430 			if (ret)
431 				break;
432 
433 			bch2_btree_and_journal_iter_exit(&iter);
434 			goto again;
435 		} else if (ret)
436 			break;
437 
438 		prev = cur;
439 		cur = NULL;
440 		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
441 	}
442 
443 	if (!ret && !IS_ERR_OR_NULL(prev)) {
444 		BUG_ON(cur);
445 		ret = lockrestart_do(trans,
446 			btree_repair_node_end(trans, b, prev, pulled_from_scan));
447 		if (ret == DID_FILL_FROM_SCAN) {
448 			new_pass = true;
449 			ret = 0;
450 		}
451 	}
452 
453 	if (!IS_ERR_OR_NULL(prev))
454 		six_unlock_read(&prev->c.lock);
455 	prev = NULL;
456 	if (!IS_ERR_OR_NULL(cur))
457 		six_unlock_read(&cur->c.lock);
458 	cur = NULL;
459 
460 	if (ret)
461 		goto err;
462 
463 	bch2_btree_and_journal_iter_exit(&iter);
464 
465 	if (new_pass)
466 		goto again;
467 
468 	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
469 	iter.prefetch = true;
470 
471 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
472 		bch2_bkey_buf_reassemble(&cur_k, c, k);
473 		bch2_btree_and_journal_iter_advance(&iter);
474 
475 		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
476 					b->c.btree_id, b->c.level - 1,
477 					false);
478 		ret = PTR_ERR_OR_ZERO(cur);
479 
480 		bch_err_msg(c, ret, "getting btree node");
481 		if (ret)
482 			goto err;
483 
484 		ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
485 		six_unlock_read(&cur->c.lock);
486 		cur = NULL;
487 
488 		if (ret == DROP_THIS_NODE) {
489 			bch2_btree_node_evict(trans, cur_k.k);
490 			ret = bch2_journal_key_delete(c, b->c.btree_id,
491 						      b->c.level, cur_k.k->k.p);
492 			new_pass = true;
493 		}
494 
495 		if (ret)
496 			goto err;
497 
498 		have_child = true;
499 	}
500 
501 	printbuf_reset(&buf);
502 	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
503 	prt_newline(&buf);
504 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
505 
506 	/*
507 	 * XXX: we're not passing the trans object here because we're not set up
508 	 * to handle a transaction restart - this code needs to be rewritten
509 	 * when we start doing online topology repair
510 	 */
511 	bch2_trans_unlock_long(trans);
512 	if (mustfix_fsck_err_on(!have_child,
513 			c, btree_node_topology_interior_node_empty,
514 			"empty interior btree node at %s", buf.buf))
515 		ret = DROP_THIS_NODE;
516 err:
517 fsck_err:
518 	if (!IS_ERR_OR_NULL(prev))
519 		six_unlock_read(&prev->c.lock);
520 	if (!IS_ERR_OR_NULL(cur))
521 		six_unlock_read(&cur->c.lock);
522 
523 	bch2_btree_and_journal_iter_exit(&iter);
524 
525 	if (!ret && new_pass)
526 		goto again;
527 
528 	BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
529 
530 	bch2_bkey_buf_exit(&prev_k, c);
531 	bch2_bkey_buf_exit(&cur_k, c);
532 	printbuf_exit(&buf);
533 	bch_err_fn(c, ret);
534 	return ret;
535 }
536 
537 static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
538 			   bool *reconstructed_root)
539 {
540 	struct bch_fs *c = trans->c;
541 	struct btree_root *r = bch2_btree_id_root(c, btree);
542 	struct printbuf buf = PRINTBUF;
543 	int ret = 0;
544 
545 	bch2_btree_id_to_text(&buf, btree);
546 
547 	if (r->error) {
548 		bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
549 
550 		ret = bch2_btree_has_scanned_nodes(c, btree);
551 		if (ret < 0)
552 			goto err;
553 
554 		if (!ret) {
555 			__fsck_err(trans,
556 				   FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
557 				   btree_root_unreadable_and_scan_found_nothing,
558 				   "no nodes found for btree %s, continue?", buf.buf);
559 
560 			r->alive = false;
561 			r->error = 0;
562 			bch2_btree_root_alloc_fake_trans(trans, btree, 0);
563 		} else {
564 			r->alive = false;
565 			r->error = 0;
566 			bch2_btree_root_alloc_fake_trans(trans, btree, 1);
567 
568 			bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
569 			ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
570 			if (ret)
571 				goto err;
572 		}
573 
574 		*reconstructed_root = true;
575 	}
576 err:
577 fsck_err:
578 	printbuf_exit(&buf);
579 	bch_err_fn(c, ret);
580 	return ret;
581 }
582 
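/*
 * Entry point for topology repair: check every btree root (reconstructing
 * unreadable roots from scanned nodes where possible), then recursively
 * repair each tree. A root whose tree turns out to be empty is replaced with
 * a fake empty root.
 */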
583 int bch2_check_topology(struct bch_fs *c)
584 {
585 	struct btree_trans *trans = bch2_trans_get(c);
586 	struct bpos pulled_from_scan = POS_MIN;
587 	int ret = 0;
588 
589 	bch2_trans_srcu_unlock(trans);
590 
591 	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
592 		bool reconstructed_root = false;
593 recover:
594 		ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
595 		if (ret)
596 			break;
597 
598 		struct btree_root *r = bch2_btree_id_root(c, i);
599 		struct btree *b = r->b;
600 
601 		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
602 		ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
603 		six_unlock_read(&b->c.lock);
604 
605 		if (ret == DROP_THIS_NODE) {
606 			mutex_lock(&c->btree_cache.lock);
607 			bch2_btree_node_hash_remove(&c->btree_cache, b);
608 			mutex_unlock(&c->btree_cache.lock);
609 
610 			r->b = NULL;
611 
612 			if (!reconstructed_root) {
613 				r->error = -EIO;
614 				goto recover;
615 			}
616 
617 			struct printbuf buf = PRINTBUF;
618 			bch2_btree_id_to_text(&buf, i);
619 			bch_err(c, "empty btree root %s", buf.buf);
620 			printbuf_exit(&buf);
621 			bch2_btree_root_alloc_fake_trans(trans, i, 0);
622 			r->alive = false;
623 			ret = 0;
624 		}
625 	}
626 
627 	bch2_trans_put(trans);
628 	return ret;
629 }
630 
631 /* marking of btree keys/nodes: */
632 
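/*
 * Mark a single key for gc: check/repair the key's pointers, then run the gc
 * trigger to recompute bucket and accounting information. Any repairs are
 * committed before the gc trigger runs, since the gc trigger is not
 * idempotent (see the comment below).
 */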
633 static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
634 			    unsigned level, struct btree **prev,
635 			    struct btree_iter *iter, struct bkey_s_c k,
636 			    bool initial)
637 {
638 	struct bch_fs *c = trans->c;
639 
640 	if (iter) {
641 		struct btree_path *path = btree_iter_path(trans, iter);
642 		struct btree *b = path_l(path)->b;
643 
644 		if (*prev != b) {
645 			int ret = bch2_btree_node_check_topology(trans, b);
646 			if (ret)
647 				return ret;
648 		}
649 		*prev = b;
650 	}
651 
652 	struct bkey deleted = KEY(0, 0, 0);
653 	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
654 	struct printbuf buf = PRINTBUF;
655 	int ret = 0;
656 
657 	deleted.p = k.k->p;
658 
659 	if (initial) {
660 		BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
661 		       k.k->bversion.lo > atomic64_read(&c->journal.seq));
662 
663 		if (fsck_err_on(btree_id != BTREE_ID_accounting &&
664 				k.k->bversion.lo > atomic64_read(&c->key_version),
665 				trans, bkey_version_in_future,
666 				"key version number higher than recorded %llu\n%s",
667 				atomic64_read(&c->key_version),
668 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
669 			atomic64_set(&c->key_version, k.k->bversion.lo);
670 	}
671 
672 	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
673 				trans, btree_bitmap_not_marked,
674 				"btree ptr not marked in member info btree allocated bitmap\n%s",
675 				(printbuf_reset(&buf),
676 				 bch2_bkey_val_to_text(&buf, c, k),
677 				 buf.buf))) {
678 		mutex_lock(&c->sb_lock);
679 		bch2_dev_btree_bitmap_mark(c, k);
680 		bch2_write_super(c);
681 		mutex_unlock(&c->sb_lock);
682 	}
683 
684 	/*
685 	 * We require a commit before key_trigger() because
686 	 * key_trigger(BTREE_TRIGGER_GC) is not idempotant; we'll calculate the
687 	 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
688 	 */
689 	unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
690 
691 	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
692 			       BTREE_TRIGGER_check_repair|flags);
693 	if (ret)
694 		goto out;
695 
696 	if (trans->nr_updates) {
697 		ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
698 			-BCH_ERR_transaction_restart_nested;
699 		goto out;
700 	}
701 
702 	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
703 			       BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
704 out:
705 fsck_err:
706 	printbuf_exit(&buf);
707 	bch_err_fn(c, ret);
708 	return ret;
709 }
710 
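/*
 * Mark all keys in one btree, walking each level from the leaves (or from
 * level 1 if leaf keys have no triggers) up to, and including, the root key.
 */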
711 static int bch2_gc_btree(struct btree_trans *trans,
712 			 struct progress_indicator_state *progress,
713 			 enum btree_id btree, bool initial)
714 {
715 	struct bch_fs *c = trans->c;
716 	unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
717 	int ret = 0;
718 
719 	/* We need to make sure every leaf node is readable before going RW */
720 	if (initial)
721 		target_depth = 0;
722 
723 	for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
724 		struct btree *prev = NULL;
725 		struct btree_iter iter;
726 		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
727 					  BTREE_ITER_prefetch);
728 
729 		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
730 			bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
731 			gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
732 			bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
733 		}));
734 		if (ret)
735 			goto err;
736 	}
737 
738 	/* root */
739 	do {
740 retry_root:
741 		bch2_trans_begin(trans);
742 
743 		struct btree_iter iter;
744 		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
745 					  0, bch2_btree_id_root(c, btree)->b->c.level, 0);
746 		struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
747 		ret = PTR_ERR_OR_ZERO(b);
748 		if (ret)
749 			goto err_root;
750 
751 		if (b != btree_node_root(c, b)) {
752 			bch2_trans_iter_exit(trans, &iter);
753 			goto retry_root;
754 		}
755 
756 		gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
757 		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
758 		ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
759 err_root:
760 		bch2_trans_iter_exit(trans, &iter);
761 	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
762 err:
763 	bch_err_fn(c, ret);
764 	return ret;
765 }
766 
767 static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
768 {
769 	return cmp_int(gc_btree_order(l), gc_btree_order(r));
770 }
771 
772 static int bch2_gc_btrees(struct bch_fs *c)
773 {
774 	struct btree_trans *trans = bch2_trans_get(c);
775 	struct printbuf buf = PRINTBUF;
776 	int ret = 0;
777 
778 	struct progress_indicator_state progress;
779 	bch2_progress_init(&progress, c, ~0ULL);
780 
781 	enum btree_id ids[BTREE_ID_NR];
782 	for (unsigned i = 0; i < BTREE_ID_NR; i++)
783 		ids[i] = i;
784 	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
785 
786 	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
787 		unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
788 
789 		if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
790 			continue;
791 
792 		ret = bch2_gc_btree(trans, &progress, btree, true);
793 	}
794 
795 	printbuf_exit(&buf);
796 	bch2_trans_put(trans);
797 	bch_err_fn(c, ret);
798 	return ret;
799 }
800 
801 static int bch2_mark_superblocks(struct bch_fs *c)
802 {
803 	gc_pos_set(c, gc_phase(GC_PHASE_sb));
804 
805 	return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
806 }
807 
808 static void bch2_gc_free(struct bch_fs *c)
809 {
810 	bch2_accounting_gc_free(c);
811 
812 	genradix_free(&c->reflink_gc_table);
813 	genradix_free(&c->gc_stripes);
814 
815 	for_each_member_device(c, ca)
816 		genradix_free(&ca->buckets_gc);
817 }
818 
819 static int bch2_gc_start(struct bch_fs *c)
820 {
821 	for_each_member_device(c, ca) {
822 		int ret = bch2_dev_usage_init(ca, true);
823 		if (ret) {
824 			bch2_dev_put(ca);
825 			return ret;
826 		}
827 	}
828 
829 	return 0;
830 }
831 
832 /* returns true if not equal */
833 static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
834 				     struct bch_alloc_v4 r)
835 {
836 	return  l.gen != r.gen				||
837 		l.oldest_gen != r.oldest_gen		||
838 		l.data_type != r.data_type		||
839 		l.dirty_sectors	!= r.dirty_sectors	||
840 		l.stripe_sectors != r.stripe_sectors	||
841 		l.cached_sectors != r.cached_sectors	 ||
842 		l.stripe_redundancy != r.stripe_redundancy ||
843 		l.stripe != r.stripe;
844 }
845 
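/*
 * Compare the bucket state gc computed (in ca->buckets_gc) against the
 * existing alloc key; report and fix any fields that differ, writing the
 * corrected alloc key back with triggers disabled (BTREE_TRIGGER_norun).
 */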
846 static int bch2_alloc_write_key(struct btree_trans *trans,
847 				struct btree_iter *iter,
848 				struct bch_dev *ca,
849 				struct bkey_s_c k)
850 {
851 	struct bch_fs *c = trans->c;
852 	struct bkey_i_alloc_v4 *a;
853 	struct bch_alloc_v4 old_gc, gc, old_convert, new;
854 	const struct bch_alloc_v4 *old;
855 	int ret;
856 
857 	if (!bucket_valid(ca, k.k->p.offset))
858 		return 0;
859 
860 	old = bch2_alloc_to_v4(k, &old_convert);
861 	gc = new = *old;
862 
863 	__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
864 
865 	old_gc = gc;
866 
867 	if ((old->data_type == BCH_DATA_sb ||
868 	     old->data_type == BCH_DATA_journal) &&
869 	    !bch2_dev_is_online(ca)) {
870 		gc.data_type = old->data_type;
871 		gc.dirty_sectors = old->dirty_sectors;
872 	}
873 
874 	/*
875 	 * gc.data_type doesn't yet include need_discard & need_gc_gen states -
876 	 * fix that here:
877 	 */
878 	alloc_data_type_set(&gc, gc.data_type);
879 	if (gc.data_type != old_gc.data_type ||
880 	    gc.dirty_sectors != old_gc.dirty_sectors) {
881 		ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
882 		if (ret)
883 			return ret;
884 
885 		/*
886 		 * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
887 		 * safe w.r.t. transaction restarts, so fixup the gc_bucket so
888 		 * we don't run it twice:
889 		 */
890 		struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
891 		gc_m->data_type = gc.data_type;
892 		gc_m->dirty_sectors = gc.dirty_sectors;
893 	}
894 
895 	if (fsck_err_on(new.data_type != gc.data_type,
896 			trans, alloc_key_data_type_wrong,
897 			"bucket %llu:%llu gen %u has wrong data_type"
898 			": got %s, should be %s",
899 			iter->pos.inode, iter->pos.offset,
900 			gc.gen,
901 			bch2_data_type_str(new.data_type),
902 			bch2_data_type_str(gc.data_type)))
903 		new.data_type = gc.data_type;
904 
905 #define copy_bucket_field(_errtype, _f)					\
906 	if (fsck_err_on(new._f != gc._f,				\
907 			trans, _errtype,				\
908 			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
909 			": got %llu, should be %llu",			\
910 			iter->pos.inode, iter->pos.offset,		\
911 			gc.gen,						\
912 			bch2_data_type_str(gc.data_type),		\
913 			(u64) new._f, (u64) gc._f))				\
914 		new._f = gc._f;						\
915 
916 	copy_bucket_field(alloc_key_gen_wrong,			gen);
917 	copy_bucket_field(alloc_key_dirty_sectors_wrong,	dirty_sectors);
918 	copy_bucket_field(alloc_key_stripe_sectors_wrong,	stripe_sectors);
919 	copy_bucket_field(alloc_key_cached_sectors_wrong,	cached_sectors);
920 	copy_bucket_field(alloc_key_stripe_wrong,		stripe);
921 	copy_bucket_field(alloc_key_stripe_redundancy_wrong,	stripe_redundancy);
922 #undef copy_bucket_field
923 
924 	if (!bch2_alloc_v4_cmp(*old, new))
925 		return 0;
926 
927 	a = bch2_alloc_to_v4_mut(trans, k);
928 	ret = PTR_ERR_OR_ZERO(a);
929 	if (ret)
930 		return ret;
931 
932 	a->v = new;
933 
934 	/*
935 	 * The trigger normally makes sure these are set, but we're not running
936 	 * triggers:
937 	 */
938 	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
939 		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
940 
941 	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
942 fsck_err:
943 	return ret;
944 }
945 
946 static int bch2_gc_alloc_done(struct bch_fs *c)
947 {
948 	int ret = 0;
949 
950 	for_each_member_device(c, ca) {
951 		ret = bch2_trans_run(c,
952 			for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
953 					POS(ca->dev_idx, ca->mi.first_bucket),
954 					POS(ca->dev_idx, ca->mi.nbuckets - 1),
955 					BTREE_ITER_slots|BTREE_ITER_prefetch, k,
956 					NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
957 				bch2_alloc_write_key(trans, &iter, ca, k)));
958 		if (ret) {
959 			bch2_dev_put(ca);
960 			break;
961 		}
962 	}
963 
964 	bch_err_fn(c, ret);
965 	return ret;
966 }
967 
968 static int bch2_gc_alloc_start(struct bch_fs *c)
969 {
970 	int ret = 0;
971 
972 	for_each_member_device(c, ca) {
973 		ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
974 		if (ret) {
975 			bch2_dev_put(ca);
976 			ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
977 			break;
978 		}
979 	}
980 
981 	bch_err_fn(c, ret);
982 	return ret;
983 }
984 
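/*
 * Compare a stripe's per-block sector counts against what gc computed in
 * c->gc_stripes, and rewrite the stripe key if they disagree.
 */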
985 static int bch2_gc_write_stripes_key(struct btree_trans *trans,
986 				     struct btree_iter *iter,
987 				     struct bkey_s_c k)
988 {
989 	struct bch_fs *c = trans->c;
990 	struct printbuf buf = PRINTBUF;
991 	const struct bch_stripe *s;
992 	struct gc_stripe *m;
993 	bool bad = false;
994 	unsigned i;
995 	int ret = 0;
996 
997 	if (k.k->type != KEY_TYPE_stripe)
998 		return 0;
999 
1000 	s = bkey_s_c_to_stripe(k).v;
1001 	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
1002 
1003 	for (i = 0; i < s->nr_blocks; i++) {
1004 		u32 old = stripe_blockcount_get(s, i);
1005 		u32 new = (m ? m->block_sectors[i] : 0);
1006 
1007 		if (old != new) {
1008 			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
1009 				   i, old, new);
1010 			bad = true;
1011 		}
1012 	}
1013 
1014 	if (bad)
1015 		bch2_bkey_val_to_text(&buf, c, k);
1016 
1017 	if (fsck_err_on(bad,
1018 			trans, stripe_sector_count_wrong,
1019 			"%s", buf.buf)) {
1020 		struct bkey_i_stripe *new;
1021 
1022 		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1023 		ret = PTR_ERR_OR_ZERO(new);
1024 		if (ret)
1025 			return ret;
1026 
1027 		bkey_reassemble(&new->k_i, k);
1028 
1029 		for (i = 0; i < new->v.nr_blocks; i++)
1030 			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
1031 
1032 		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
1033 	}
1034 fsck_err:
1035 	printbuf_exit(&buf);
1036 	return ret;
1037 }
1038 
1039 static int bch2_gc_stripes_done(struct bch_fs *c)
1040 {
1041 	return bch2_trans_run(c,
1042 		for_each_btree_key_commit(trans, iter,
1043 				BTREE_ID_stripes, POS_MIN,
1044 				BTREE_ITER_prefetch, k,
1045 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1046 			bch2_gc_write_stripes_key(trans, &iter, k)));
1047 }
1048 
1049 /**
1050  * bch2_check_allocations - walk all references to buckets, and recompute them:
1051  *
1052  * @c:			filesystem object
1053  *
1054  * Returns: 0 on success, or standard errcode on failure
1055  *
1056  * Order matters here:
1057  *  - Concurrent GC relies on the fact that we have a total ordering for
1058  *    everything that GC walks - see  gc_will_visit_node(),
1059  *    gc_will_visit_root()
1060  *
1061  *  - also, references move around in the course of index updates and
1062  *    various other crap: everything needs to agree on the ordering
1063  *    references are allowed to move around in - e.g., we're allowed to
1064  *    start with a reference owned by an open_bucket (the allocator) and
1065  *    move it to the btree, but not the reverse.
1066  *
1067  *    This is necessary to ensure that gc doesn't miss references that
1068  *    move around - if references move backwards in the ordering GC
1069  *    uses, GC could skip past them
1070  */
1071 int bch2_check_allocations(struct bch_fs *c)
1072 {
1073 	int ret;
1074 
1075 	down_read(&c->state_lock);
1076 	down_write(&c->gc_lock);
1077 
1078 	bch2_btree_interior_updates_flush(c);
1079 
1080 	ret   = bch2_gc_accounting_start(c) ?:
1081 		bch2_gc_start(c) ?:
1082 		bch2_gc_alloc_start(c) ?:
1083 		bch2_gc_reflink_start(c);
1084 	if (ret)
1085 		goto out;
1086 
1087 	gc_pos_set(c, gc_phase(GC_PHASE_start));
1088 
1089 	ret = bch2_mark_superblocks(c);
1090 	bch_err_msg(c, ret, "marking superblocks");
1091 	if (ret)
1092 		goto out;
1093 
1094 	ret = bch2_gc_btrees(c);
1095 	if (ret)
1096 		goto out;
1097 
1098 	c->gc_count++;
1099 
1100 	ret   = bch2_gc_alloc_done(c) ?:
1101 		bch2_gc_accounting_done(c) ?:
1102 		bch2_gc_stripes_done(c) ?:
1103 		bch2_gc_reflink_done(c);
1104 out:
1105 	percpu_down_write(&c->mark_lock);
1106 	/* Indicates that gc is no longer in progress: */
1107 	__gc_pos_set(c, gc_phase(GC_PHASE_not_running));
1108 
1109 	bch2_gc_free(c);
1110 	percpu_up_write(&c->mark_lock);
1111 
1112 	up_write(&c->gc_lock);
1113 	up_read(&c->state_lock);
1114 
1115 	/*
1116 	 * At startup, allocations can happen directly instead of via the
1117 	 * allocator thread - issue wakeup in case they blocked on gc_lock:
1118 	 */
1119 	closure_wake_up(&c->freelist_wait);
1120 
1121 	if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
1122 		bch2_sb_members_clean_deleted(c);
1123 
1124 	bch_err_fn(c, ret);
1125 	return ret;
1126 }
1127 
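/*
 * Per-key work for gc_gens: if any pointer is badly stale (more than 16
 * generations behind its bucket), rewrite the key via bch2_extent_normalize()
 * so stale cached pointers get dropped; otherwise record the oldest
 * generation still referenced in each bucket in ca->oldest_gen.
 */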
1128 static int gc_btree_gens_key(struct btree_trans *trans,
1129 			     struct btree_iter *iter,
1130 			     struct bkey_s_c k)
1131 {
1132 	struct bch_fs *c = trans->c;
1133 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1134 
1135 	if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
1136 		return -EROFS;
1137 
1138 	bool too_stale = false;
1139 	scoped_guard(rcu) {
1140 		bkey_for_each_ptr(ptrs, ptr) {
1141 			struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
1142 			if (!ca)
1143 				continue;
1144 
1145 			too_stale |= dev_ptr_stale(ca, ptr) > 16;
1146 		}
1147 
1148 		if (!too_stale)
1149 			bkey_for_each_ptr(ptrs, ptr) {
1150 				struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
1151 				if (!ca)
1152 					continue;
1153 
1154 				u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
1155 				if (gen_after(*gen, ptr->gen))
1156 					*gen = ptr->gen;
1157 			}
1158 	}
1159 
1160 	if (too_stale) {
1161 		struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
1162 		int ret = PTR_ERR_OR_ZERO(u);
1163 		if (ret)
1164 			return ret;
1165 
1166 		bch2_extent_normalize(c, bkey_i_to_s(u));
1167 	}
1168 
1169 	return 0;
1170 }
1171 
1172 static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
1173 				       struct btree_iter *iter, struct bkey_s_c k)
1174 {
1175 	struct bch_alloc_v4 a_convert;
1176 	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
1177 	struct bkey_i_alloc_v4 *a_mut;
1178 	int ret;
1179 
1180 	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
1181 		return 0;
1182 
1183 	a_mut = bch2_alloc_to_v4_mut(trans, k);
1184 	ret = PTR_ERR_OR_ZERO(a_mut);
1185 	if (ret)
1186 		return ret;
1187 
1188 	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
1189 
1190 	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
1191 }
1192 
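/*
 * "Gen" garbage collection: walk every btree that has pointers, recording the
 * oldest pointer generation still in use for each bucket, then write that
 * back to the alloc btree as oldest_gen.
 */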
1193 int bch2_gc_gens(struct bch_fs *c)
1194 {
1195 	u64 b, start_time = local_clock();
1196 	int ret;
1197 
1198 	if (!mutex_trylock(&c->gc_gens_lock))
1199 		return 0;
1200 
1201 	trace_and_count(c, gc_gens_start, c);
1202 
1203 	/*
1204 	 * We have to use trylock here. Otherwise, we would
1205 	 * introduce a deadlock in the RO path - we take the
1206 	 * state lock at the start of going RO.
1207 	 */
1208 	if (!down_read_trylock(&c->state_lock)) {
1209 		mutex_unlock(&c->gc_gens_lock);
1210 		return 0;
1211 	}
1212 
1213 	for_each_member_device(c, ca) {
1214 		struct bucket_gens *gens = bucket_gens(ca);
1215 
1216 		BUG_ON(ca->oldest_gen);
1217 
1218 		ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
1219 		if (!ca->oldest_gen) {
1220 			bch2_dev_put(ca);
1221 			ret = bch_err_throw(c, ENOMEM_gc_gens);
1222 			goto err;
1223 		}
1224 
1225 		for (b = gens->first_bucket;
1226 		     b < gens->nbuckets; b++)
1227 			ca->oldest_gen[b] = gens->b[b];
1228 	}
1229 
1230 	for (unsigned i = 0; i < BTREE_ID_NR; i++)
1231 		if (btree_type_has_ptrs(i)) {
1232 			c->gc_gens_btree = i;
1233 			c->gc_gens_pos = POS_MIN;
1234 
1235 			ret = bch2_trans_run(c,
1236 				for_each_btree_key_commit(trans, iter, i,
1237 						POS_MIN,
1238 						BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
1239 						k,
1240 						NULL, NULL,
1241 						BCH_TRANS_COMMIT_no_enospc,
1242 					gc_btree_gens_key(trans, &iter, k)));
1243 			if (ret)
1244 				goto err;
1245 		}
1246 
1247 	struct bch_dev *ca = NULL;
1248 	ret = bch2_trans_run(c,
1249 		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1250 				POS_MIN,
1251 				BTREE_ITER_prefetch,
1252 				k,
1253 				NULL, NULL,
1254 				BCH_TRANS_COMMIT_no_enospc, ({
1255 			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
1256 			if (!ca) {
1257 				bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
1258 				continue;
1259 			}
1260 			bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
1261 		})));
1262 	bch2_dev_put(ca);
1263 
1264 	if (ret)
1265 		goto err;
1266 
1267 	c->gc_gens_btree	= 0;
1268 	c->gc_gens_pos		= POS_MIN;
1269 
1270 	c->gc_count++;
1271 
1272 	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
1273 	trace_and_count(c, gc_gens_end, c);
1274 err:
1275 	for_each_member_device(c, ca) {
1276 		kvfree(ca->oldest_gen);
1277 		ca->oldest_gen = NULL;
1278 	}
1279 
1280 	up_read(&c->state_lock);
1281 	mutex_unlock(&c->gc_gens_lock);
1282 	if (!bch2_err_matches(ret, EROFS))
1283 		bch_err_fn(c, ret);
1284 	return ret;
1285 }
1286 
1287 static void bch2_gc_gens_work(struct work_struct *work)
1288 {
1289 	struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
1290 	bch2_gc_gens(c);
1291 	enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
1292 }
1293 
1294 void bch2_gc_gens_async(struct bch_fs *c)
1295 {
1296 	if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
1297 	    !queue_work(c->write_ref_wq, &c->gc_gens_work))
1298 		enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
1299 }
1300 
1301 void bch2_fs_btree_gc_init_early(struct bch_fs *c)
1302 {
1303 	seqcount_init(&c->gc_pos_lock);
1304 	INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
1305 
1306 	init_rwsem(&c->gc_lock);
1307 	mutex_init(&c->gc_gens_lock);
1308 }
1309