xref: /linux/fs/bcachefs/btree_gc.c (revision 031fba65fc202abf1f193e321be7a2c274fd88ba)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4  * Copyright (C) 2014 Datera Inc.
5  */
6 
7 #include "bcachefs.h"
8 #include "alloc_background.h"
9 #include "alloc_foreground.h"
10 #include "bkey_methods.h"
11 #include "bkey_buf.h"
12 #include "btree_journal_iter.h"
13 #include "btree_key_cache.h"
14 #include "btree_locking.h"
15 #include "btree_update_interior.h"
16 #include "btree_io.h"
17 #include "btree_gc.h"
18 #include "buckets.h"
19 #include "clock.h"
20 #include "debug.h"
21 #include "ec.h"
22 #include "error.h"
23 #include "extents.h"
24 #include "journal.h"
25 #include "keylist.h"
26 #include "move.h"
27 #include "recovery.h"
28 #include "reflink.h"
29 #include "replicas.h"
30 #include "super-io.h"
31 #include "trace.h"
32 
33 #include <linux/slab.h>
34 #include <linux/bitops.h>
35 #include <linux/freezer.h>
36 #include <linux/kthread.h>
37 #include <linux/preempt.h>
38 #include <linux/rcupdate.h>
39 #include <linux/sched/task.h>
40 
41 #define DROP_THIS_NODE		10
42 #define DROP_PREV_NODE		11
43 
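/*
 * Restarting into topology repair only makes sense if the user is letting
 * fsck fix errors and the check_topology recovery pass hasn't already run:
 */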
44 static bool should_restart_for_topology_repair(struct bch_fs *c)
45 {
46 	return c->opts.fix_errors != FSCK_FIX_no &&
47 		!(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
48 }
49 
50 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
51 {
52 	preempt_disable();
53 	write_seqcount_begin(&c->gc_pos_lock);
54 	c->gc_pos = new_pos;
55 	write_seqcount_end(&c->gc_pos_lock);
56 	preempt_enable();
57 }
58 
59 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
60 {
61 	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
62 	__gc_pos_set(c, new_pos);
63 }
64 
65 /*
66  * Missing: if an interior btree node is empty, we need to do something -
67  * perhaps just kill it
68  */
69 static int bch2_gc_check_topology(struct bch_fs *c,
70 				  struct btree *b,
71 				  struct bkey_buf *prev,
72 				  struct bkey_buf cur,
73 				  bool is_last)
74 {
75 	struct bpos node_start	= b->data->min_key;
76 	struct bpos node_end	= b->data->max_key;
77 	struct bpos expected_start = bkey_deleted(&prev->k->k)
78 		? node_start
79 		: bpos_successor(prev->k->k.p);
80 	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
81 	int ret = 0;
82 
83 	if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
84 		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
85 
86 		if (!bpos_eq(expected_start, bp->v.min_key)) {
87 			bch2_topology_error(c);
88 
89 			if (bkey_deleted(&prev->k->k)) {
90 				prt_printf(&buf1, "start of node: ");
91 				bch2_bpos_to_text(&buf1, node_start);
92 			} else {
93 				bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
94 			}
95 			bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
96 
97 			if (__fsck_err(c,
98 				  FSCK_CAN_FIX|
99 				  FSCK_CAN_IGNORE|
100 				  FSCK_NO_RATELIMIT,
101 				  "btree node with incorrect min_key at btree %s level %u:\n"
102 				  "  prev %s\n"
103 				  "  cur %s",
104 				  bch2_btree_ids[b->c.btree_id], b->c.level,
105 				  buf1.buf, buf2.buf) &&
106 			    should_restart_for_topology_repair(c)) {
107 				bch_info(c, "Halting mark and sweep to start topology repair pass");
108 				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
109 				goto err;
110 			} else {
111 				set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
112 			}
113 		}
114 	}
115 
116 	if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
117 		bch2_topology_error(c);
118 
119 		printbuf_reset(&buf1);
120 		printbuf_reset(&buf2);
121 
122 		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
123 		bch2_bpos_to_text(&buf2, node_end);
124 
125 		if (__fsck_err(c,
126 			  FSCK_CAN_FIX|
127 			  FSCK_CAN_IGNORE|
128 			  FSCK_NO_RATELIMIT,
129 			  "btree node with incorrect max_key at btree %s level %u:\n"
130 			  "  %s\n"
131 			  "  expected %s",
132 			  bch2_btree_ids[b->c.btree_id], b->c.level,
133 			  buf1.buf, buf2.buf) &&
134 		    should_restart_for_topology_repair(c)) {
135 			bch_info(c, "Halting mark and sweep to start topology repair pass");
136 			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
137 			goto err;
138 		} else {
139 			set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
140 		}
141 	}
142 
143 	bch2_bkey_buf_copy(prev, c, cur.k);
144 err:
145 fsck_err:
146 	printbuf_exit(&buf2);
147 	printbuf_exit(&buf1);
148 	return ret;
149 }
150 
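/*
 * The repair helpers below rewrite a node's min_key/max_key; btree_ptr_v2 is
 * the only btree pointer format that stores min_key, so convert old style
 * btree_ptr keys to v2 first:
 */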
151 static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
152 {
153 	switch (b->key.k.type) {
154 	case KEY_TYPE_btree_ptr: {
155 		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
156 
157 		dst->k.p		= src->k.p;
158 		dst->v.mem_ptr		= 0;
159 		dst->v.seq		= b->data->keys.seq;
160 		dst->v.sectors_written	= 0;
161 		dst->v.flags		= 0;
162 		dst->v.min_key		= b->data->min_key;
163 		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
164 		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
165 		break;
166 	}
167 	case KEY_TYPE_btree_ptr_v2:
168 		bkey_copy(&dst->k_i, &b->key);
169 		break;
170 	default:
171 		BUG();
172 	}
173 }
174 
175 static void bch2_btree_node_update_key_early(struct btree_trans *trans,
176 					     enum btree_id btree, unsigned level,
177 					     struct bkey_s_c old, struct bkey_i *new)
178 {
179 	struct bch_fs *c = trans->c;
180 	struct btree *b;
181 	struct bkey_buf tmp;
182 	int ret;
183 
184 	bch2_bkey_buf_init(&tmp);
185 	bch2_bkey_buf_reassemble(&tmp, c, old);
186 
187 	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
188 	if (!IS_ERR_OR_NULL(b)) {
189 		mutex_lock(&c->btree_cache.lock);
190 
191 		bch2_btree_node_hash_remove(&c->btree_cache, b);
192 
193 		bkey_copy(&b->key, new);
194 		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
195 		BUG_ON(ret);
196 
197 		mutex_unlock(&c->btree_cache.lock);
198 		six_unlock_read(&b->c.lock);
199 	}
200 
201 	bch2_bkey_buf_exit(&tmp, c);
202 }
203 
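/*
 * set_node_min()/set_node_max() rewrite a node's start or end position, both
 * in the in-memory btree node and in the parent's pointer to it (via the
 * in-memory journal keys), then drop any keys now outside the node:
 */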
204 static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
205 {
206 	struct bkey_i_btree_ptr_v2 *new;
207 	int ret;
208 
209 	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
210 	if (!new)
211 		return -BCH_ERR_ENOMEM_gc_repair_key;
212 
213 	btree_ptr_to_v2(b, new);
214 	b->data->min_key	= new_min;
215 	new->v.min_key		= new_min;
216 	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
217 
218 	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
219 	if (ret) {
220 		kfree(new);
221 		return ret;
222 	}
223 
224 	bch2_btree_node_drop_keys_outside_node(b);
225 	bkey_copy(&b->key, &new->k_i);
226 	return 0;
227 }
228 
229 static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
230 {
231 	struct bkey_i_btree_ptr_v2 *new;
232 	int ret;
233 
234 	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
235 	if (ret)
236 		return ret;
237 
238 	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
239 	if (!new)
240 		return -BCH_ERR_ENOMEM_gc_repair_key;
241 
242 	btree_ptr_to_v2(b, new);
243 	b->data->max_key	= new_max;
244 	new->k.p		= new_max;
245 	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
246 
247 	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
248 	if (ret) {
249 		kfree(new);
250 		return ret;
251 	}
252 
253 	bch2_btree_node_drop_keys_outside_node(b);
254 
255 	mutex_lock(&c->btree_cache.lock);
256 	bch2_btree_node_hash_remove(&c->btree_cache, b);
257 
258 	bkey_copy(&b->key, &new->k_i);
259 	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
260 	BUG_ON(ret);
261 	mutex_unlock(&c->btree_cache.lock);
262 	return 0;
263 }
264 
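/*
 * Check that @prev and @cur, two adjacent children of @b, cover exactly
 * adjacent key ranges: where they overlap, the newer node (by BTREE_NODE_SEQ)
 * wins and the older one is either dropped (DROP_PREV_NODE/DROP_THIS_NODE) or
 * has its boundary moved; a gap is closed by moving @cur's min_key back:
 */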
265 static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
266 					struct btree *prev, struct btree *cur)
267 {
268 	struct bpos expected_start = !prev
269 		? b->data->min_key
270 		: bpos_successor(prev->key.k.p);
271 	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
272 	int ret = 0;
273 
274 	if (!prev) {
275 		prt_printf(&buf1, "start of node: ");
276 		bch2_bpos_to_text(&buf1, b->data->min_key);
277 	} else {
278 		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
279 	}
280 
281 	bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
282 
283 	if (prev &&
284 	    bpos_gt(expected_start, cur->data->min_key) &&
285 	    BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
286 		/* cur overwrites prev: */
287 
288 		if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
289 						cur->data->min_key), c,
290 				"btree node overwritten by next node at btree %s level %u:\n"
291 				"  node %s\n"
292 				"  next %s",
293 				bch2_btree_ids[b->c.btree_id], b->c.level,
294 				buf1.buf, buf2.buf)) {
295 			ret = DROP_PREV_NODE;
296 			goto out;
297 		}
298 
299 		if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
300 						 bpos_predecessor(cur->data->min_key)), c,
301 				"btree node with incorrect max_key at btree %s level %u:\n"
302 				"  node %s\n"
303 				"  next %s",
304 				bch2_btree_ids[b->c.btree_id], b->c.level,
305 				buf1.buf, buf2.buf))
306 			ret = set_node_max(c, prev,
307 					   bpos_predecessor(cur->data->min_key));
308 	} else {
309 		/* prev overwrites cur: */
310 
311 		if (mustfix_fsck_err_on(bpos_ge(expected_start,
312 						cur->data->max_key), c,
313 				"btree node overwritten by prev node at btree %s level %u:\n"
314 				"  prev %s\n"
315 				"  node %s",
316 				bch2_btree_ids[b->c.btree_id], b->c.level,
317 				buf1.buf, buf2.buf)) {
318 			ret = DROP_THIS_NODE;
319 			goto out;
320 		}
321 
322 		if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
323 				"btree node with incorrect min_key at btree %s level %u:\n"
324 				"  prev %s\n"
325 				"  node %s",
326 				bch2_btree_ids[b->c.btree_id], b->c.level,
327 				buf1.buf, buf2.buf))
328 			ret = set_node_min(c, cur, expected_start);
329 	}
330 out:
331 fsck_err:
332 	printbuf_exit(&buf2);
333 	printbuf_exit(&buf1);
334 	return ret;
335 }
336 
337 static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
338 				 struct btree *child)
339 {
340 	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
341 	int ret = 0;
342 
343 	bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
344 	bch2_bpos_to_text(&buf2, b->key.k.p);
345 
346 	if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
347 			"btree node with incorrect max_key at btree %s level %u:\n"
348 			"  %s\n"
349 			"  expected %s",
350 			bch2_btree_ids[b->c.btree_id], b->c.level,
351 			buf1.buf, buf2.buf)) {
352 		ret = set_node_max(c, child, b->key.k.p);
353 		if (ret)
354 			goto err;
355 	}
356 err:
357 fsck_err:
358 	printbuf_exit(&buf2);
359 	printbuf_exit(&buf1);
360 	return ret;
361 }
362 
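/*
 * Walk an interior node's children: evict children that can't be read, repair
 * child boundaries so that they tile @b's range exactly, then recurse into
 * each child; returns DROP_THIS_NODE if every child ended up being dropped:
 */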
363 static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
364 {
365 	struct bch_fs *c = trans->c;
366 	struct btree_and_journal_iter iter;
367 	struct bkey_s_c k;
368 	struct bkey_buf prev_k, cur_k;
369 	struct btree *prev = NULL, *cur = NULL;
370 	bool have_child, dropped_children = false;
371 	struct printbuf buf = PRINTBUF;
372 	int ret = 0;
373 
374 	if (!b->c.level)
375 		return 0;
376 again:
377 	prev = NULL;
378 	have_child = dropped_children = false;
379 	bch2_bkey_buf_init(&prev_k);
380 	bch2_bkey_buf_init(&cur_k);
381 	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
382 
383 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
384 		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
385 		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
386 
387 		bch2_btree_and_journal_iter_advance(&iter);
388 		bch2_bkey_buf_reassemble(&cur_k, c, k);
389 
390 		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
391 					b->c.btree_id, b->c.level - 1,
392 					false);
393 		ret = PTR_ERR_OR_ZERO(cur);
394 
395 		printbuf_reset(&buf);
396 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
397 
398 		if (mustfix_fsck_err_on(ret == -EIO, c,
399 				"Topology repair: unreadable btree node at btree %s level %u:\n"
400 				"  %s",
401 				bch2_btree_ids[b->c.btree_id],
402 				b->c.level - 1,
403 				buf.buf)) {
404 			bch2_btree_node_evict(trans, cur_k.k);
405 			ret = bch2_journal_key_delete(c, b->c.btree_id,
406 						      b->c.level, cur_k.k->k.p);
407 			cur = NULL;
408 			if (ret)
409 				break;
410 			continue;
411 		}
412 
413 		if (ret) {
414 			bch_err_msg(c, ret, "getting btree node");
415 			break;
416 		}
417 
418 		ret = btree_repair_node_boundaries(c, b, prev, cur);
419 
420 		if (ret == DROP_THIS_NODE) {
421 			six_unlock_read(&cur->c.lock);
422 			bch2_btree_node_evict(trans, cur_k.k);
423 			ret = bch2_journal_key_delete(c, b->c.btree_id,
424 						      b->c.level, cur_k.k->k.p);
425 			cur = NULL;
426 			if (ret)
427 				break;
428 			continue;
429 		}
430 
431 		if (prev)
432 			six_unlock_read(&prev->c.lock);
433 		prev = NULL;
434 
435 		if (ret == DROP_PREV_NODE) {
436 			bch2_btree_node_evict(trans, prev_k.k);
437 			ret = bch2_journal_key_delete(c, b->c.btree_id,
438 						      b->c.level, prev_k.k->k.p);
439 			if (ret)
440 				break;
441 
442 			bch2_btree_and_journal_iter_exit(&iter);
443 			bch2_bkey_buf_exit(&prev_k, c);
444 			bch2_bkey_buf_exit(&cur_k, c);
445 			goto again;
446 		} else if (ret)
447 			break;
448 
449 		prev = cur;
450 		cur = NULL;
451 		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
452 	}
453 
454 	if (!ret && !IS_ERR_OR_NULL(prev)) {
455 		BUG_ON(cur);
456 		ret = btree_repair_node_end(c, b, prev);
457 	}
458 
459 	if (!IS_ERR_OR_NULL(prev))
460 		six_unlock_read(&prev->c.lock);
461 	prev = NULL;
462 	if (!IS_ERR_OR_NULL(cur))
463 		six_unlock_read(&cur->c.lock);
464 	cur = NULL;
465 
466 	if (ret)
467 		goto err;
468 
469 	bch2_btree_and_journal_iter_exit(&iter);
470 	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
471 
472 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
473 		bch2_bkey_buf_reassemble(&cur_k, c, k);
474 		bch2_btree_and_journal_iter_advance(&iter);
475 
476 		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
477 					b->c.btree_id, b->c.level - 1,
478 					false);
479 		ret = PTR_ERR_OR_ZERO(cur);
480 
481 		if (ret) {
482 			bch_err_msg(c, ret, "getting btree node");
483 			goto err;
484 		}
485 
486 		ret = bch2_btree_repair_topology_recurse(trans, cur);
487 		six_unlock_read(&cur->c.lock);
488 		cur = NULL;
489 
490 		if (ret == DROP_THIS_NODE) {
491 			bch2_btree_node_evict(trans, cur_k.k);
492 			ret = bch2_journal_key_delete(c, b->c.btree_id,
493 						      b->c.level, cur_k.k->k.p);
494 			dropped_children = true;
495 		}
496 
497 		if (ret)
498 			goto err;
499 
500 		have_child = true;
501 	}
502 
503 	printbuf_reset(&buf);
504 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
505 
506 	if (mustfix_fsck_err_on(!have_child, c,
507 			"empty interior btree node at btree %s level %u\n"
508 			"  %s",
509 			bch2_btree_ids[b->c.btree_id],
510 			b->c.level, buf.buf))
511 		ret = DROP_THIS_NODE;
512 err:
513 fsck_err:
514 	if (!IS_ERR_OR_NULL(prev))
515 		six_unlock_read(&prev->c.lock);
516 	if (!IS_ERR_OR_NULL(cur))
517 		six_unlock_read(&cur->c.lock);
518 
519 	bch2_btree_and_journal_iter_exit(&iter);
520 	bch2_bkey_buf_exit(&prev_k, c);
521 	bch2_bkey_buf_exit(&cur_k, c);
522 
523 	if (!ret && dropped_children)
524 		goto again;
525 
526 	printbuf_exit(&buf);
527 	return ret;
528 }
529 
530 int bch2_check_topology(struct bch_fs *c)
531 {
532 	struct btree_trans *trans = bch2_trans_get(c);
533 	struct btree *b;
534 	unsigned i;
535 	int ret = 0;
536 
537 	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
538 		struct btree_root *r = bch2_btree_id_root(c, i);
539 
540 		if (!r->alive)
541 			continue;
542 
543 		b = r->b;
544 		if (btree_node_fake(b))
545 			continue;
546 
547 		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
548 		ret = bch2_btree_repair_topology_recurse(trans, b);
549 		six_unlock_read(&b->c.lock);
550 
551 		if (ret == DROP_THIS_NODE) {
552 			bch_err(c, "empty btree root - repair unimplemented");
553 			ret = -BCH_ERR_fsck_repair_unimplemented;
554 		}
555 	}
556 
557 	bch2_trans_put(trans);
558 
559 	return ret;
560 }
561 
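/*
 * Check a key's pointers against gc's in-memory bucket state (gens, data
 * types, stripes); if they disagree and fsck says to fix it, rewrite the key
 * in the journal keys - dropping bad extent pointers, or for btree node
 * pointers just updating gens, since the read path copes with missing
 * replicas:
 */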
562 static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
563 			       unsigned level, bool is_root,
564 			       struct bkey_s_c *k)
565 {
566 	struct bch_fs *c = trans->c;
567 	struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
568 	const union bch_extent_entry *entry_c;
569 	struct extent_ptr_decoded p = { 0 };
570 	bool do_update = false;
571 	struct printbuf buf = PRINTBUF;
572 	int ret = 0;
573 
574 	/*
575 	 * XXX
576 	 * use check_bucket_ref here
577 	 */
578 	bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
579 		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
580 		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
581 		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
582 
583 		if (!g->gen_valid &&
584 		    (c->opts.reconstruct_alloc ||
585 		     fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
586 			      "while marking %s",
587 			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
588 			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
589 			      p.ptr.gen,
590 			      (printbuf_reset(&buf),
591 			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
592 			if (!p.ptr.cached) {
593 				g->gen_valid		= true;
594 				g->gen			= p.ptr.gen;
595 			} else {
596 				do_update = true;
597 			}
598 		}
599 
600 		if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
601 		    (c->opts.reconstruct_alloc ||
602 		     fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
603 			      "while marking %s",
604 			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
605 			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
606 			      p.ptr.gen, g->gen,
607 			      (printbuf_reset(&buf),
608 			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
609 			if (!p.ptr.cached) {
610 				g->gen_valid		= true;
611 				g->gen			= p.ptr.gen;
612 				g->data_type		= 0;
613 				g->dirty_sectors	= 0;
614 				g->cached_sectors	= 0;
615 				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
616 			} else {
617 				do_update = true;
618 			}
619 		}
620 
621 		if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
622 		    (c->opts.reconstruct_alloc ||
623 		     fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
624 			      "while marking %s",
625 			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
626 			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
627 			      p.ptr.gen,
628 			      (printbuf_reset(&buf),
629 			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
630 			do_update = true;
631 
632 		if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
633 		    (c->opts.reconstruct_alloc ||
634 		     fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
635 			      "while marking %s",
636 			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
637 			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
638 			      p.ptr.gen, g->gen,
639 			      (printbuf_reset(&buf),
640 			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
641 			do_update = true;
642 
643 		if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
644 			continue;
645 
646 		if (fsck_err_on(bucket_data_type(g->data_type) &&
647 				bucket_data_type(g->data_type) != data_type, c,
648 				"bucket %u:%zu different types of data in same bucket: %s, %s\n"
649 				"while marking %s",
650 				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
651 				bch2_data_types[g->data_type],
652 				bch2_data_types[data_type],
653 				(printbuf_reset(&buf),
654 				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
655 			if (data_type == BCH_DATA_btree) {
656 				g->data_type	= data_type;
657 				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
658 			} else {
659 				do_update = true;
660 			}
661 		}
662 
663 		if (p.has_ec) {
664 			struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
665 
666 			if (fsck_err_on(!m || !m->alive, c,
667 					"pointer to nonexistent stripe %llu\n"
668 					"while marking %s",
669 					(u64) p.ec.idx,
670 					(printbuf_reset(&buf),
671 					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
672 				do_update = true;
673 
674 			if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
675 					"pointer does not match stripe %llu\n"
676 					"while marking %s",
677 					(u64) p.ec.idx,
678 					(printbuf_reset(&buf),
679 					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
680 				do_update = true;
681 		}
682 	}
683 
684 	if (do_update) {
685 		struct bkey_ptrs ptrs;
686 		union bch_extent_entry *entry;
687 		struct bch_extent_ptr *ptr;
688 		struct bkey_i *new;
689 
690 		if (is_root) {
691 			bch_err(c, "cannot update btree roots yet");
692 			ret = -EINVAL;
693 			goto err;
694 		}
695 
696 		new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
697 		if (!new) {
698 			ret = -BCH_ERR_ENOMEM_gc_repair_key;
699 			bch_err_msg(c, ret, "allocating new key");
700 			goto err;
701 		}
702 
703 		bkey_reassemble(new, *k);
704 
705 		if (level) {
706 			/*
707 			 * We don't want to drop btree node pointers - if the
708 			 * btree node isn't there anymore, the read path will
709 			 * sort it out:
710 			 */
711 			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
712 			bkey_for_each_ptr(ptrs, ptr) {
713 				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
714 				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
715 
716 				ptr->gen = g->gen;
717 			}
718 		} else {
719 			bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
720 				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
721 				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
722 				enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
723 
724 				(ptr->cached &&
725 				 (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
726 				(!ptr->cached &&
727 				 gen_cmp(ptr->gen, g->gen) < 0) ||
728 				gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
729 				(g->data_type &&
730 				 g->data_type != data_type);
731 			}));
732 again:
733 			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
734 			bkey_extent_entry_for_each(ptrs, entry) {
735 				if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
736 					struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
737 									entry->stripe_ptr.idx);
738 					union bch_extent_entry *next_ptr;
739 
740 					bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
741 						if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
742 							goto found;
743 					next_ptr = NULL;
744 found:
745 					if (!next_ptr) {
746 						bch_err(c, "aieee, found stripe ptr with no data ptr");
747 						continue;
748 					}
749 
750 					if (!m || !m->alive ||
751 					    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
752 								       &next_ptr->ptr,
753 								       m->sectors)) {
754 						bch2_bkey_extent_entry_drop(new, entry);
755 						goto again;
756 					}
757 				}
758 			}
759 		}
760 
761 		ret = bch2_journal_key_insert_take(c, btree_id, level, new);
762 		if (ret) {
763 			kfree(new);
764 			goto err;
765 		}
766 
767 		if (level)
768 			bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
769 
770 		if (0) {
771 			printbuf_reset(&buf);
772 			bch2_bkey_val_to_text(&buf, c, *k);
773 			bch_info(c, "updated %s", buf.buf);
774 
775 			printbuf_reset(&buf);
776 			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
777 			bch_info(c, "new key %s", buf.buf);
778 		}
779 
780 		*k = bkey_i_to_s_c(new);
781 	}
782 err:
783 fsck_err:
784 	printbuf_exit(&buf);
785 	return ret;
786 }
787 
788 /* marking of btree keys/nodes: */
789 
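/*
 * Mark a single key: in the initial (fsck) pass, first repair its pointers
 * and note the highest key version seen, then run the normal mark path with
 * BTREE_TRIGGER_GC so that accounting goes to the gc copies of the counters:
 */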
790 static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
791 			    unsigned level, bool is_root,
792 			    struct bkey_s_c *k,
793 			    bool initial)
794 {
795 	struct bch_fs *c = trans->c;
796 	struct bkey deleted = KEY(0, 0, 0);
797 	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
798 	unsigned flags =
799 		BTREE_TRIGGER_GC|
800 		(initial ? BTREE_TRIGGER_NOATOMIC : 0);
801 	int ret = 0;
802 
803 	deleted.p = k->k->p;
804 
805 	if (initial) {
806 		BUG_ON(bch2_journal_seq_verify &&
807 		       k->k->version.lo > atomic64_read(&c->journal.seq));
808 
809 		ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
810 		if (ret)
811 			goto err;
812 
813 		if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
814 				"key version number higher than recorded: %llu > %llu",
815 				k->k->version.lo,
816 				atomic64_read(&c->key_version)))
817 			atomic64_set(&c->key_version, k->k->version.lo);
818 	}
819 
820 	ret = commit_do(trans, NULL, NULL, 0,
821 			bch2_mark_key(trans, btree_id, level, old, *k, flags));
822 fsck_err:
823 err:
824 	if (ret)
825 		bch_err_fn(c, ret);
826 	return ret;
827 }
828 
829 static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
830 {
831 	struct bch_fs *c = trans->c;
832 	struct btree_node_iter iter;
833 	struct bkey unpacked;
834 	struct bkey_s_c k;
835 	struct bkey_buf prev, cur;
836 	int ret = 0;
837 
838 	if (!btree_node_type_needs_gc(btree_node_type(b)))
839 		return 0;
840 
841 	bch2_btree_node_iter_init_from_start(&iter, b);
842 	bch2_bkey_buf_init(&prev);
843 	bch2_bkey_buf_init(&cur);
844 	bkey_init(&prev.k->k);
845 
846 	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
847 		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
848 				       &k, initial);
849 		if (ret)
850 			break;
851 
852 		bch2_btree_node_iter_advance(&iter, b);
853 
854 		if (b->c.level) {
855 			bch2_bkey_buf_reassemble(&cur, c, k);
856 
857 			ret = bch2_gc_check_topology(c, b, &prev, cur,
858 					bch2_btree_node_iter_end(&iter));
859 			if (ret)
860 				break;
861 		}
862 	}
863 
864 	bch2_bkey_buf_exit(&cur, c);
865 	bch2_bkey_buf_exit(&prev, c);
866 	return ret;
867 }
868 
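/*
 * Walk one btree with ordinary btree iterators (the non-initial gc path),
 * marking every key and the root; gc_pos is advanced as we go so that
 * concurrent updates can tell whether gc has seen a given node yet.
 * metadata_only restricts the walk to interior nodes, i.e. btree pointers:
 */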
869 static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
870 			 bool initial, bool metadata_only)
871 {
872 	struct bch_fs *c = trans->c;
873 	struct btree_iter iter;
874 	struct btree *b;
875 	unsigned depth = metadata_only ? 1 : 0;
876 	int ret = 0;
877 
878 	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
879 
880 	__for_each_btree_node(trans, iter, btree_id, POS_MIN,
881 			      0, depth, BTREE_ITER_PREFETCH, b, ret) {
882 		bch2_verify_btree_nr_keys(b);
883 
884 		gc_pos_set(c, gc_pos_btree_node(b));
885 
886 		ret = btree_gc_mark_node(trans, b, initial);
887 		if (ret)
888 			break;
889 	}
890 	bch2_trans_iter_exit(trans, &iter);
891 
892 	if (ret)
893 		return ret;
894 
895 	mutex_lock(&c->btree_root_lock);
896 	b = bch2_btree_id_root(c, btree_id)->b;
897 	if (!btree_node_fake(b)) {
898 		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
899 
900 		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
901 				       true, &k, initial);
902 	}
903 	gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
904 	mutex_unlock(&c->btree_root_lock);
905 
906 	return ret;
907 }
908 
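/*
 * Initial-gc version of the btree walk, run before journal replay: iterate
 * each node's keys through the btree-and-journal overlay, mark them, check
 * topology as we go, and recurse until target_depth is reached:
 */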
909 static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
910 				      unsigned target_depth)
911 {
912 	struct bch_fs *c = trans->c;
913 	struct btree_and_journal_iter iter;
914 	struct bkey_s_c k;
915 	struct bkey_buf cur, prev;
916 	struct printbuf buf = PRINTBUF;
917 	int ret = 0;
918 
919 	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
920 	bch2_bkey_buf_init(&prev);
921 	bch2_bkey_buf_init(&cur);
922 	bkey_init(&prev.k->k);
923 
924 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
925 		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
926 		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
927 
928 		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
929 				       false, &k, true);
930 		if (ret)
931 			goto fsck_err;
932 
933 		if (b->c.level) {
934 			bch2_bkey_buf_reassemble(&cur, c, k);
935 			k = bkey_i_to_s_c(cur.k);
936 
937 			bch2_btree_and_journal_iter_advance(&iter);
938 
939 			ret = bch2_gc_check_topology(c, b,
940 					&prev, cur,
941 					!bch2_btree_and_journal_iter_peek(&iter).k);
942 			if (ret)
943 				goto fsck_err;
944 		} else {
945 			bch2_btree_and_journal_iter_advance(&iter);
946 		}
947 	}
948 
949 	if (b->c.level > target_depth) {
950 		bch2_btree_and_journal_iter_exit(&iter);
951 		bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
952 
953 		while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
954 			struct btree *child;
955 
956 			bch2_bkey_buf_reassemble(&cur, c, k);
957 			bch2_btree_and_journal_iter_advance(&iter);
958 
959 			child = bch2_btree_node_get_noiter(trans, cur.k,
960 						b->c.btree_id, b->c.level - 1,
961 						false);
962 			ret = PTR_ERR_OR_ZERO(child);
963 
964 			if (ret == -EIO) {
965 				bch2_topology_error(c);
966 
967 				if (__fsck_err(c,
968 					  FSCK_CAN_FIX|
969 					  FSCK_CAN_IGNORE|
970 					  FSCK_NO_RATELIMIT,
971 					  "Unreadable btree node at btree %s level %u:\n"
972 					  "  %s",
973 					  bch2_btree_ids[b->c.btree_id],
974 					  b->c.level - 1,
975 					  (printbuf_reset(&buf),
976 					   bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
977 				    should_restart_for_topology_repair(c)) {
978 					bch_info(c, "Halting mark and sweep to start topology repair pass");
979 					ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
980 					goto fsck_err;
981 				} else {
982 					/* Continue marking when opted to not
983 					/* Continue marking when we've opted
984 					 * not to fix the error: */
985 					set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
986 					continue;
987 				}
988 			} else if (ret) {
989 				bch_err_msg(c, ret, "getting btree node");
990 				break;
991 			}
992 
993 			ret = bch2_gc_btree_init_recurse(trans, child,
994 							 target_depth);
995 			six_unlock_read(&child->c.lock);
996 
997 			if (ret)
998 				break;
999 		}
1000 	}
1001 fsck_err:
1002 	bch2_bkey_buf_exit(&cur, c);
1003 	bch2_bkey_buf_exit(&prev, c);
1004 	bch2_btree_and_journal_iter_exit(&iter);
1005 	printbuf_exit(&buf);
1006 	return ret;
1007 }
1008 
1009 static int bch2_gc_btree_init(struct btree_trans *trans,
1010 			      enum btree_id btree_id,
1011 			      bool metadata_only)
1012 {
1013 	struct bch_fs *c = trans->c;
1014 	struct btree *b;
1015 	unsigned target_depth = metadata_only ? 1 : 0;
1016 	struct printbuf buf = PRINTBUF;
1017 	int ret = 0;
1018 
1019 	b = bch2_btree_id_root(c, btree_id)->b;
1020 
1021 	if (btree_node_fake(b))
1022 		return 0;
1023 
1024 	six_lock_read(&b->c.lock, NULL, NULL);
1025 	printbuf_reset(&buf);
1026 	bch2_bpos_to_text(&buf, b->data->min_key);
1027 	if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
1028 			"btree root with incorrect min_key: %s", buf.buf)) {
1029 		bch_err(c, "repair unimplemented");
1030 		ret = -BCH_ERR_fsck_repair_unimplemented;
1031 		goto fsck_err;
1032 	}
1033 
1034 	printbuf_reset(&buf);
1035 	bch2_bpos_to_text(&buf, b->data->max_key);
1036 	if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
1037 			"btree root with incorrect max_key: %s", buf.buf)) {
1038 		bch_err(c, "repair unimplemented");
1039 		ret = -BCH_ERR_fsck_repair_unimplemented;
1040 		goto fsck_err;
1041 	}
1042 
1043 	if (b->c.level >= target_depth)
1044 		ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
1045 
1046 	if (!ret) {
1047 		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
1048 
1049 		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
1050 				       &k, true);
1051 	}
1052 fsck_err:
1053 	six_unlock_read(&b->c.lock);
1054 
1055 	if (ret < 0)
1056 		bch_err_fn(c, ret);
1057 	printbuf_exit(&buf);
1058 	return ret;
1059 }
1060 
1061 static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
1062 {
1063 	return  (int) btree_id_to_gc_phase(l) -
1064 		(int) btree_id_to_gc_phase(r);
1065 }
1066 
1067 static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
1068 {
1069 	struct btree_trans *trans = bch2_trans_get(c);
1070 	enum btree_id ids[BTREE_ID_NR];
1071 	unsigned i;
1072 	int ret = 0;
1073 
1074 	for (i = 0; i < BTREE_ID_NR; i++)
1075 		ids[i] = i;
1076 	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
1077 
1078 	for (i = 0; i < BTREE_ID_NR && !ret; i++)
1079 		ret = initial
1080 			? bch2_gc_btree_init(trans, ids[i], metadata_only)
1081 			: bch2_gc_btree(trans, ids[i], initial, metadata_only);
1082 
1083 	for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
1084 		if (!bch2_btree_id_root(c, i)->alive)
1085 			continue;
1086 
1087 		ret = initial
1088 			? bch2_gc_btree_init(trans, i, metadata_only)
1089 			: bch2_gc_btree(trans, i, initial, metadata_only);
1090 	}
1091 
1092 	if (ret < 0)
1093 		bch_err_fn(c, ret);
1094 
1095 	bch2_trans_put(trans);
1096 	return ret;
1097 }
1098 
1099 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
1100 				  u64 start, u64 end,
1101 				  enum bch_data_type type,
1102 				  unsigned flags)
1103 {
1104 	u64 b = sector_to_bucket(ca, start);
1105 
1106 	do {
1107 		unsigned sectors =
1108 			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
1109 
1110 		bch2_mark_metadata_bucket(c, ca, b, type, sectors,
1111 					  gc_phase(GC_PHASE_SB), flags);
1112 		b++;
1113 		start += sectors;
1114 	} while (start < end);
1115 }
1116 
1117 static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
1118 				     unsigned flags)
1119 {
1120 	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
1121 	unsigned i;
1122 	u64 b;
1123 
1124 	for (i = 0; i < layout->nr_superblocks; i++) {
1125 		u64 offset = le64_to_cpu(layout->sb_offset[i]);
1126 
1127 		if (offset == BCH_SB_SECTOR)
1128 			mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
1129 					      BCH_DATA_sb, flags);
1130 
1131 		mark_metadata_sectors(c, ca, offset,
1132 				      offset + (1 << layout->sb_max_size_bits),
1133 				      BCH_DATA_sb, flags);
1134 	}
1135 
1136 	for (i = 0; i < ca->journal.nr; i++) {
1137 		b = ca->journal.buckets[i];
1138 		bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
1139 					  ca->mi.bucket_size,
1140 					  gc_phase(GC_PHASE_SB), flags);
1141 	}
1142 }
1143 
1144 static void bch2_mark_superblocks(struct bch_fs *c)
1145 {
1146 	struct bch_dev *ca;
1147 	unsigned i;
1148 
1149 	mutex_lock(&c->sb_lock);
1150 	gc_pos_set(c, gc_phase(GC_PHASE_SB));
1151 
1152 	for_each_online_member(ca, c, i)
1153 		bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
1154 	mutex_unlock(&c->sb_lock);
1155 }
1156 
1157 #if 0
1158 /* Also see bch2_pending_btree_node_free_insert_done() */
1159 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
1160 {
1161 	struct btree_update *as;
1162 	struct pending_btree_node_free *d;
1163 
1164 	mutex_lock(&c->btree_interior_update_lock);
1165 	gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
1166 
1167 	for_each_pending_btree_node_free(c, as, d)
1168 		if (d->index_update_done)
1169 			bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
1170 
1171 	mutex_unlock(&c->btree_interior_update_lock);
1172 }
1173 #endif
1174 
1175 static void bch2_gc_free(struct bch_fs *c)
1176 {
1177 	struct bch_dev *ca;
1178 	unsigned i;
1179 
1180 	genradix_free(&c->reflink_gc_table);
1181 	genradix_free(&c->gc_stripes);
1182 
1183 	for_each_member_device(ca, c, i) {
1184 		kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
1185 			sizeof(struct bucket_array) +
1186 			ca->mi.nbuckets * sizeof(struct bucket));
1187 		ca->buckets_gc = NULL;
1188 
1189 		free_percpu(ca->usage_gc);
1190 		ca->usage_gc = NULL;
1191 	}
1192 
1193 	free_percpu(c->usage_gc);
1194 	c->usage_gc = NULL;
1195 }
1196 
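/*
 * Compare the usage counters gc just recomputed against the live ones:
 * mismatches are reported via fsck_err() when the existing accounting was
 * expected to be trustworthy, and (if we're fixing) the live counters are
 * overwritten with gc's versions:
 */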
1197 static int bch2_gc_done(struct bch_fs *c,
1198 			bool initial, bool metadata_only)
1199 {
1200 	struct bch_dev *ca = NULL;
1201 	struct printbuf buf = PRINTBUF;
1202 	bool verify = !metadata_only &&
1203 		!c->opts.reconstruct_alloc &&
1204 		(!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
1205 	unsigned i, dev;
1206 	int ret = 0;
1207 
1208 	percpu_down_write(&c->mark_lock);
1209 
1210 #define copy_field(_f, _msg, ...)					\
1211 	if (dst->_f != src->_f &&					\
1212 	    (!verify ||							\
1213 	     fsck_err(c, _msg ": got %llu, should be %llu"		\
1214 		      , ##__VA_ARGS__, dst->_f, src->_f)))		\
1215 		dst->_f = src->_f
1216 #define copy_dev_field(_f, _msg, ...)					\
1217 	copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
1218 #define copy_fs_field(_f, _msg, ...)					\
1219 	copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
1220 
1221 	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
1222 		bch2_fs_usage_acc_to_base(c, i);
1223 
1224 	for_each_member_device(ca, c, dev) {
1225 		struct bch_dev_usage *dst = ca->usage_base;
1226 		struct bch_dev_usage *src = (void *)
1227 			bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
1228 					     dev_usage_u64s());
1229 
1230 		copy_dev_field(buckets_ec,		"buckets_ec");
1231 
1232 		for (i = 0; i < BCH_DATA_NR; i++) {
1233 			copy_dev_field(d[i].buckets,	"%s buckets", bch2_data_types[i]);
1234 			copy_dev_field(d[i].sectors,	"%s sectors", bch2_data_types[i]);
1235 			copy_dev_field(d[i].fragmented,	"%s fragmented", bch2_data_types[i]);
1236 		}
1237 	}
1238 
1239 	{
1240 		unsigned nr = fs_usage_u64s(c);
1241 		struct bch_fs_usage *dst = c->usage_base;
1242 		struct bch_fs_usage *src = (void *)
1243 			bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
1244 
1245 		copy_fs_field(hidden,		"hidden");
1246 		copy_fs_field(btree,		"btree");
1247 
1248 		if (!metadata_only) {
1249 			copy_fs_field(data,	"data");
1250 			copy_fs_field(cached,	"cached");
1251 			copy_fs_field(reserved,	"reserved");
1252 			copy_fs_field(nr_inodes,"nr_inodes");
1253 
1254 			for (i = 0; i < BCH_REPLICAS_MAX; i++)
1255 				copy_fs_field(persistent_reserved[i],
1256 					      "persistent_reserved[%i]", i);
1257 		}
1258 
1259 		for (i = 0; i < c->replicas.nr; i++) {
1260 			struct bch_replicas_entry *e =
1261 				cpu_replicas_entry(&c->replicas, i);
1262 
1263 			if (metadata_only &&
1264 			    (e->data_type == BCH_DATA_user ||
1265 			     e->data_type == BCH_DATA_cached))
1266 				continue;
1267 
1268 			printbuf_reset(&buf);
1269 			bch2_replicas_entry_to_text(&buf, e);
1270 
1271 			copy_fs_field(replicas[i], "%s", buf.buf);
1272 		}
1273 	}
1274 
1275 #undef copy_fs_field
1276 #undef copy_dev_field
1277 #undef copy_stripe_field
1278 #undef copy_field
1279 fsck_err:
1280 	if (ca)
1281 		percpu_ref_put(&ca->ref);
1282 	if (ret)
1283 		bch_err_fn(c, ret);
1284 
1285 	percpu_up_write(&c->mark_lock);
1286 	printbuf_exit(&buf);
1287 	return ret;
1288 }
1289 
1290 static int bch2_gc_start(struct bch_fs *c)
1291 {
1292 	struct bch_dev *ca = NULL;
1293 	unsigned i;
1294 
1295 	BUG_ON(c->usage_gc);
1296 
1297 	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
1298 					 sizeof(u64), GFP_KERNEL);
1299 	if (!c->usage_gc) {
1300 		bch_err(c, "error allocating c->usage_gc");
1301 		return -BCH_ERR_ENOMEM_gc_start;
1302 	}
1303 
1304 	for_each_member_device(ca, c, i) {
1305 		BUG_ON(ca->usage_gc);
1306 
1307 		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
1308 		if (!ca->usage_gc) {
1309 			bch_err(c, "error allocating ca->usage_gc");
1310 			percpu_ref_put(&ca->ref);
1311 			return -BCH_ERR_ENOMEM_gc_start;
1312 		}
1313 
1314 		this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
1315 			       ca->mi.nbuckets - ca->mi.first_bucket);
1316 	}
1317 
1318 	return 0;
1319 }
1320 
1321 static int bch2_gc_reset(struct bch_fs *c)
1322 {
1323 	struct bch_dev *ca;
1324 	unsigned i;
1325 
1326 	for_each_member_device(ca, c, i) {
1327 		free_percpu(ca->usage_gc);
1328 		ca->usage_gc = NULL;
1329 	}
1330 
1331 	free_percpu(c->usage_gc);
1332 	c->usage_gc = NULL;
1333 
1334 	return bch2_gc_start(c);
1335 }
1336 
1337 /* returns true if not equal */
1338 static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
1339 				     struct bch_alloc_v4 r)
1340 {
1341 	return  l.gen != r.gen				||
1342 		l.oldest_gen != r.oldest_gen		||
1343 		l.data_type != r.data_type		||
1344 		l.dirty_sectors	!= r.dirty_sectors	||
1345 		l.cached_sectors != r.cached_sectors	 ||
1346 		l.stripe_redundancy != r.stripe_redundancy ||
1347 		l.stripe != r.stripe;
1348 }
1349 
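/*
 * Write back one alloc key: compare the existing alloc info against the gc
 * bucket and, if the gen, sector counts, data type or stripe info differ (and
 * fsck agrees), update the alloc key to match what gc found. Returns 1 once
 * we're past this device's last bucket, to end the loop for this device:
 */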
1350 static int bch2_alloc_write_key(struct btree_trans *trans,
1351 				struct btree_iter *iter,
1352 				struct bkey_s_c k,
1353 				bool metadata_only)
1354 {
1355 	struct bch_fs *c = trans->c;
1356 	struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
1357 	struct bucket gc, *b;
1358 	struct bkey_i_alloc_v4 *a;
1359 	struct bch_alloc_v4 old_convert, new;
1360 	const struct bch_alloc_v4 *old;
1361 	enum bch_data_type type;
1362 	int ret;
1363 
1364 	if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
1365 		return 1;
1366 
1367 	old = bch2_alloc_to_v4(k, &old_convert);
1368 	new = *old;
1369 
1370 	percpu_down_read(&c->mark_lock);
1371 	b = gc_bucket(ca, iter->pos.offset);
1372 
1373 	/*
1374 	 * b->data_type doesn't yet include need_discard & need_gc_gen states -
1375 	 * fix that here:
1376 	 */
1377 	type = __alloc_data_type(b->dirty_sectors,
1378 				 b->cached_sectors,
1379 				 b->stripe,
1380 				 *old,
1381 				 b->data_type);
1382 	if (b->data_type != type) {
1383 		struct bch_dev_usage *u;
1384 
1385 		preempt_disable();
1386 		u = this_cpu_ptr(ca->usage_gc);
1387 		u->d[b->data_type].buckets--;
1388 		b->data_type = type;
1389 		u->d[b->data_type].buckets++;
1390 		preempt_enable();
1391 	}
1392 
1393 	gc = *b;
1394 	percpu_up_read(&c->mark_lock);
1395 
1396 	if (metadata_only &&
1397 	    gc.data_type != BCH_DATA_sb &&
1398 	    gc.data_type != BCH_DATA_journal &&
1399 	    gc.data_type != BCH_DATA_btree)
1400 		return 0;
1401 
1402 	if (gen_after(old->gen, gc.gen))
1403 		return 0;
1404 
1405 	if (c->opts.reconstruct_alloc ||
1406 	    fsck_err_on(new.data_type != gc.data_type, c,
1407 			"bucket %llu:%llu gen %u has wrong data_type"
1408 			": got %s, should be %s",
1409 			iter->pos.inode, iter->pos.offset,
1410 			gc.gen,
1411 			bch2_data_types[new.data_type],
1412 			bch2_data_types[gc.data_type]))
1413 		new.data_type = gc.data_type;
1414 
1415 #define copy_bucket_field(_f)						\
1416 	if (c->opts.reconstruct_alloc ||				\
1417 	    fsck_err_on(new._f != gc._f, c,				\
1418 			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
1419 			": got %u, should be %u",			\
1420 			iter->pos.inode, iter->pos.offset,		\
1421 			gc.gen,						\
1422 			bch2_data_types[gc.data_type],			\
1423 			new._f, gc._f))					\
1424 		new._f = gc._f;						\
1425 
1426 	copy_bucket_field(gen);
1427 	copy_bucket_field(dirty_sectors);
1428 	copy_bucket_field(cached_sectors);
1429 	copy_bucket_field(stripe_redundancy);
1430 	copy_bucket_field(stripe);
1431 #undef copy_bucket_field
1432 
1433 	if (!bch2_alloc_v4_cmp(*old, new))
1434 		return 0;
1435 
1436 	a = bch2_alloc_to_v4_mut(trans, k);
1437 	ret = PTR_ERR_OR_ZERO(a);
1438 	if (ret)
1439 		return ret;
1440 
1441 	a->v = new;
1442 
1443 	/*
1444 	 * The trigger normally makes sure this is set, but we're not running
1445 	 * triggers:
1446 	 */
1447 	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
1448 		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
1449 
1450 	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
1451 fsck_err:
1452 	return ret;
1453 }
1454 
1455 static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
1456 {
1457 	struct btree_trans *trans = bch2_trans_get(c);
1458 	struct btree_iter iter;
1459 	struct bkey_s_c k;
1460 	struct bch_dev *ca;
1461 	unsigned i;
1462 	int ret = 0;
1463 
1464 	for_each_member_device(ca, c, i) {
1465 		ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1466 				POS(ca->dev_idx, ca->mi.first_bucket),
1467 				BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
1468 				NULL, NULL, BTREE_INSERT_LAZY_RW,
1469 			bch2_alloc_write_key(trans, &iter, k, metadata_only));
1470 
1471 		if (ret < 0) {
1472 			bch_err_fn(c, ret);
1473 			percpu_ref_put(&ca->ref);
1474 			break;
1475 		}
1476 	}
1477 
1478 	bch2_trans_put(trans);
1479 	return ret < 0 ? ret : 0;
1480 }
1481 
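/*
 * Allocate the gc copy of each device's bucket array and seed it with the
 * current gens from the alloc btree; in metadata_only mode the existing
 * user/cached/parity counters are carried over, since they won't be
 * recomputed:
 */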
1482 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
1483 {
1484 	struct bch_dev *ca;
1485 	struct btree_trans *trans = bch2_trans_get(c);
1486 	struct btree_iter iter;
1487 	struct bkey_s_c k;
1488 	struct bucket *g;
1489 	struct bch_alloc_v4 a_convert;
1490 	const struct bch_alloc_v4 *a;
1491 	unsigned i;
1492 	int ret;
1493 
1494 	for_each_member_device(ca, c, i) {
1495 		struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
1496 				ca->mi.nbuckets * sizeof(struct bucket),
1497 				GFP_KERNEL|__GFP_ZERO);
1498 		if (!buckets) {
1499 			percpu_ref_put(&ca->ref);
1500 			bch_err(c, "error allocating ca->buckets[gc]");
1501 			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
1502 			goto err;
1503 		}
1504 
1505 		buckets->first_bucket	= ca->mi.first_bucket;
1506 		buckets->nbuckets	= ca->mi.nbuckets;
1507 		rcu_assign_pointer(ca->buckets_gc, buckets);
1508 	}
1509 
1510 	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
1511 			   BTREE_ITER_PREFETCH, k, ret) {
1512 		ca = bch_dev_bkey_exists(c, k.k->p.inode);
1513 		g = gc_bucket(ca, k.k->p.offset);
1514 
1515 		a = bch2_alloc_to_v4(k, &a_convert);
1516 
1517 		g->gen_valid	= 1;
1518 		g->gen		= a->gen;
1519 
1520 		if (metadata_only &&
1521 		    (a->data_type == BCH_DATA_user ||
1522 		     a->data_type == BCH_DATA_cached ||
1523 		     a->data_type == BCH_DATA_parity)) {
1524 			g->data_type		= a->data_type;
1525 			g->dirty_sectors	= a->dirty_sectors;
1526 			g->cached_sectors	= a->cached_sectors;
1527 			g->stripe		= a->stripe;
1528 			g->stripe_redundancy	= a->stripe_redundancy;
1529 		}
1530 	}
1531 	bch2_trans_iter_exit(trans, &iter);
1532 err:
1533 	bch2_trans_put(trans);
1534 	if (ret)
1535 		bch_err_fn(c, ret);
1536 	return ret;
1537 }
1538 
1539 static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
1540 {
1541 	struct bch_dev *ca;
1542 	unsigned i;
1543 
1544 	for_each_member_device(ca, c, i) {
1545 		struct bucket_array *buckets = gc_bucket_array(ca);
1546 		struct bucket *g;
1547 
1548 		for_each_bucket(g, buckets) {
1549 			if (metadata_only &&
1550 			    (g->data_type == BCH_DATA_user ||
1551 			     g->data_type == BCH_DATA_cached ||
1552 			     g->data_type == BCH_DATA_parity))
1553 				continue;
1554 			g->data_type = 0;
1555 			g->dirty_sectors = 0;
1556 			g->cached_sectors = 0;
1557 		}
1558 	}
1559 }
1560 
1561 static int bch2_gc_write_reflink_key(struct btree_trans *trans,
1562 				     struct btree_iter *iter,
1563 				     struct bkey_s_c k,
1564 				     size_t *idx)
1565 {
1566 	struct bch_fs *c = trans->c;
1567 	const __le64 *refcount = bkey_refcount_c(k);
1568 	struct printbuf buf = PRINTBUF;
1569 	struct reflink_gc *r;
1570 	int ret = 0;
1571 
1572 	if (!refcount)
1573 		return 0;
1574 
1575 	while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
1576 	       r->offset < k.k->p.offset)
1577 		++*idx;
1578 
1579 	if (!r ||
1580 	    r->offset != k.k->p.offset ||
1581 	    r->size != k.k->size) {
1582 		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
1583 		return -EINVAL;
1584 	}
1585 
1586 	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
1587 			"reflink key has wrong refcount:\n"
1588 			"  %s\n"
1589 			"  should be %u",
1590 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
1591 			r->refcount)) {
1592 		struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
1593 
1594 		ret = PTR_ERR_OR_ZERO(new);
1595 		if (ret)
1596 			return ret;
1597 
1598 		if (!r->refcount)
1599 			new->k.type = KEY_TYPE_deleted;
1600 		else
1601 			*bkey_refcount(new) = cpu_to_le64(r->refcount);
1602 	}
1603 fsck_err:
1604 	printbuf_exit(&buf);
1605 	return ret;
1606 }
1607 
1608 static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
1609 {
1610 	struct btree_trans *trans;
1611 	struct btree_iter iter;
1612 	struct bkey_s_c k;
1613 	size_t idx = 0;
1614 	int ret = 0;
1615 
1616 	if (metadata_only)
1617 		return 0;
1618 
1619 	trans = bch2_trans_get(c);
1620 
1621 	ret = for_each_btree_key_commit(trans, iter,
1622 			BTREE_ID_reflink, POS_MIN,
1623 			BTREE_ITER_PREFETCH, k,
1624 			NULL, NULL, BTREE_INSERT_NOFAIL,
1625 		bch2_gc_write_reflink_key(trans, &iter, k, &idx));
1626 
1627 	c->reflink_gc_nr = 0;
1628 	bch2_trans_put(trans);
1629 	return ret;
1630 }
1631 
1632 static int bch2_gc_reflink_start(struct bch_fs *c,
1633 				 bool metadata_only)
1634 {
1635 	struct btree_trans *trans;
1636 	struct btree_iter iter;
1637 	struct bkey_s_c k;
1638 	struct reflink_gc *r;
1639 	int ret = 0;
1640 
1641 	if (metadata_only)
1642 		return 0;
1643 
1644 	trans = bch2_trans_get(c);
1645 	c->reflink_gc_nr = 0;
1646 
1647 	for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
1648 			   BTREE_ITER_PREFETCH, k, ret) {
1649 		const __le64 *refcount = bkey_refcount_c(k);
1650 
1651 		if (!refcount)
1652 			continue;
1653 
1654 		r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
1655 				       GFP_KERNEL);
1656 		if (!r) {
1657 			ret = -BCH_ERR_ENOMEM_gc_reflink_start;
1658 			break;
1659 		}
1660 
1661 		r->offset	= k.k->p.offset;
1662 		r->size		= k.k->size;
1663 		r->refcount	= 0;
1664 	}
1665 	bch2_trans_iter_exit(trans, &iter);
1666 
1667 	bch2_trans_put(trans);
1668 	return ret;
1669 }
1670 
1671 static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
1672 {
1673 	struct genradix_iter iter;
1674 	struct reflink_gc *r;
1675 
1676 	genradix_for_each(&c->reflink_gc_table, iter, r)
1677 		r->refcount = 0;
1678 }
1679 
1680 static int bch2_gc_write_stripes_key(struct btree_trans *trans,
1681 				     struct btree_iter *iter,
1682 				     struct bkey_s_c k)
1683 {
1684 	struct bch_fs *c = trans->c;
1685 	struct printbuf buf = PRINTBUF;
1686 	const struct bch_stripe *s;
1687 	struct gc_stripe *m;
1688 	bool bad = false;
1689 	unsigned i;
1690 	int ret = 0;
1691 
1692 	if (k.k->type != KEY_TYPE_stripe)
1693 		return 0;
1694 
1695 	s = bkey_s_c_to_stripe(k).v;
1696 	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
1697 
1698 	for (i = 0; i < s->nr_blocks; i++) {
1699 		u32 old = stripe_blockcount_get(s, i);
1700 		u32 new = (m ? m->block_sectors[i] : 0);
1701 
1702 		if (old != new) {
1703 			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
1704 				   i, old, new);
1705 			bad = true;
1706 		}
1707 	}
1708 
1709 	if (bad)
1710 		bch2_bkey_val_to_text(&buf, c, k);
1711 
1712 	if (fsck_err_on(bad, c, "%s", buf.buf)) {
1713 		struct bkey_i_stripe *new;
1714 
1715 		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1716 		ret = PTR_ERR_OR_ZERO(new);
1717 		if (ret)
1718 			return ret;
1719 
1720 		bkey_reassemble(&new->k_i, k);
1721 
1722 		for (i = 0; i < new->v.nr_blocks; i++)
1723 			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
1724 
1725 		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
1726 	}
1727 fsck_err:
1728 	printbuf_exit(&buf);
1729 	return ret;
1730 }
1731 
1732 static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
1733 {
1734 	struct btree_trans *trans;
1735 	struct btree_iter iter;
1736 	struct bkey_s_c k;
1737 	int ret = 0;
1738 
1739 	if (metadata_only)
1740 		return 0;
1741 
1742 	trans = bch2_trans_get(c);
1743 
1744 	ret = for_each_btree_key_commit(trans, iter,
1745 			BTREE_ID_stripes, POS_MIN,
1746 			BTREE_ITER_PREFETCH, k,
1747 			NULL, NULL, BTREE_INSERT_NOFAIL,
1748 		bch2_gc_write_stripes_key(trans, &iter, k));
1749 
1750 	bch2_trans_put(trans);
1751 	return ret;
1752 }
1753 
1754 static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
1755 {
1756 	genradix_free(&c->gc_stripes);
1757 }
1758 
1759 /**
1760  * bch2_gc - walk _all_ references to buckets, and recompute them:
1761  *
1762  * @c:			filesystem object
1763  * @initial:		are we in recovery?
1764  * @metadata_only:	are we just checking metadata references, or everything?
1765  *
1766  * Returns: 0 on success, or standard errcode on failure
1767  *
1768  * Order matters here:
1769  *  - Concurrent GC relies on the fact that we have a total ordering for
1770  *    everything that GC walks - see gc_will_visit_node(),
1771  *    gc_will_visit_root()
1772  *
1773  *  - also, references move around in the course of index updates and
1774  *    various other crap: everything needs to agree on the ordering
1775  *    references are allowed to move around in - e.g., we're allowed to
1776  *    start with a reference owned by an open_bucket (the allocator) and
1777  *    move it to the btree, but not the reverse.
1778  *
1779  *    This is necessary to ensure that gc doesn't miss references that
1780  *    move around - if references move backwards in the ordering GC
1781  *    uses, GC could skip past them
1782  */
1783 int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
1784 {
1785 	unsigned iter = 0;
1786 	int ret;
1787 
1788 	lockdep_assert_held(&c->state_lock);
1789 
1790 	down_write(&c->gc_lock);
1791 
1792 	bch2_btree_interior_updates_flush(c);
1793 
1794 	ret   = bch2_gc_start(c) ?:
1795 		bch2_gc_alloc_start(c, metadata_only) ?:
1796 		bch2_gc_reflink_start(c, metadata_only);
1797 	if (ret)
1798 		goto out;
1799 again:
1800 	gc_pos_set(c, gc_phase(GC_PHASE_START));
1801 
1802 	bch2_mark_superblocks(c);
1803 
1804 	ret = bch2_gc_btrees(c, initial, metadata_only);
1805 
1806 	if (ret)
1807 		goto out;
1808 
1809 #if 0
1810 	bch2_mark_pending_btree_node_frees(c);
1811 #endif
1812 	c->gc_count++;
1813 
1814 	if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
1815 	    (!iter && bch2_test_restart_gc)) {
1816 		if (iter++ > 2) {
1817 			bch_info(c, "Unable to fix bucket gens, looping");
1818 			ret = -EINVAL;
1819 			goto out;
1820 		}
1821 
1822 		/*
1823 		 * XXX: make sure gens we fixed got saved
1824 		 */
1825 		bch_info(c, "Second GC pass needed, restarting:");
1826 		clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
1827 		__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
1828 
1829 		bch2_gc_stripes_reset(c, metadata_only);
1830 		bch2_gc_alloc_reset(c, metadata_only);
1831 		bch2_gc_reflink_reset(c, metadata_only);
1832 		ret = bch2_gc_reset(c);
1833 		if (ret)
1834 			goto out;
1835 
1836 		/* flush fsck errors, reset counters */
1837 		bch2_flush_fsck_errs(c);
1838 		goto again;
1839 	}
1840 out:
1841 	if (!ret) {
1842 		bch2_journal_block(&c->journal);
1843 
1844 		ret   = bch2_gc_stripes_done(c, metadata_only) ?:
1845 			bch2_gc_reflink_done(c, metadata_only) ?:
1846 			bch2_gc_alloc_done(c, metadata_only) ?:
1847 			bch2_gc_done(c, initial, metadata_only);
1848 
1849 		bch2_journal_unblock(&c->journal);
1850 	}
1851 
1852 	percpu_down_write(&c->mark_lock);
1853 	/* Indicates that gc is no longer in progress: */
1854 	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
1855 
1856 	bch2_gc_free(c);
1857 	percpu_up_write(&c->mark_lock);
1858 
1859 	up_write(&c->gc_lock);
1860 
1861 	/*
1862 	 * At startup, allocations can happen directly instead of via the
1863 	 * allocator thread - issue wakeup in case they blocked on gc_lock:
1864 	 */
1865 	closure_wake_up(&c->freelist_wait);
1866 
1867 	if (ret)
1868 		bch_err_fn(c, ret);
1869 	return ret;
1870 }
1871 
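/*
 * For bch2_gc_gens(): if any pointer in this key is more than 16 gens stale,
 * rewrite the key with its stale cached pointers dropped; otherwise just
 * record, per bucket, the oldest gen still referenced:
 */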
1872 static int gc_btree_gens_key(struct btree_trans *trans,
1873 			     struct btree_iter *iter,
1874 			     struct bkey_s_c k)
1875 {
1876 	struct bch_fs *c = trans->c;
1877 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1878 	const struct bch_extent_ptr *ptr;
1879 	struct bkey_i *u;
1880 	int ret;
1881 
1882 	percpu_down_read(&c->mark_lock);
1883 	bkey_for_each_ptr(ptrs, ptr) {
1884 		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
1885 
1886 		if (ptr_stale(ca, ptr) > 16) {
1887 			percpu_up_read(&c->mark_lock);
1888 			goto update;
1889 		}
1890 	}
1891 
1892 	bkey_for_each_ptr(ptrs, ptr) {
1893 		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
1894 		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
1895 
1896 		if (gen_after(*gen, ptr->gen))
1897 			*gen = ptr->gen;
1898 	}
1899 	percpu_up_read(&c->mark_lock);
1900 	return 0;
1901 update:
1902 	u = bch2_bkey_make_mut(trans, iter, &k, 0);
1903 	ret = PTR_ERR_OR_ZERO(u);
1904 	if (ret)
1905 		return ret;
1906 
1907 	bch2_extent_normalize(c, bkey_i_to_s(u));
1908 	return 0;
1909 }
1910 
1911 static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
1912 				       struct bkey_s_c k)
1913 {
1914 	struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
1915 	struct bch_alloc_v4 a_convert;
1916 	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
1917 	struct bkey_i_alloc_v4 *a_mut;
1918 	int ret;
1919 
1920 	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
1921 		return 0;
1922 
1923 	a_mut = bch2_alloc_to_v4_mut(trans, k);
1924 	ret = PTR_ERR_OR_ZERO(a_mut);
1925 	if (ret)
1926 		return ret;
1927 
1928 	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
1929 	a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
1930 
1931 	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
1932 }
1933 
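/*
 * bch2_gc_gens(): protection against bucket gen wraparound - walk every btree
 * that has pointers, tracking the oldest gen still referenced for each
 * bucket, then write that back as each bucket's oldest_gen:
 */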
1934 int bch2_gc_gens(struct bch_fs *c)
1935 {
1936 	struct btree_trans *trans;
1937 	struct btree_iter iter;
1938 	struct bkey_s_c k;
1939 	struct bch_dev *ca;
1940 	u64 b, start_time = local_clock();
1941 	unsigned i;
1942 	int ret;
1943 
1944 	/*
1945 	 * Ideally we would be using state_lock and not gc_lock here, but that
1946 	 * introduces a deadlock in the RO path - we currently take the state
1947 	 * lock at the start of going RO, thus the gc thread may get stuck:
1948 	 */
1949 	if (!mutex_trylock(&c->gc_gens_lock))
1950 		return 0;
1951 
1952 	trace_and_count(c, gc_gens_start, c);
1953 	down_read(&c->gc_lock);
1954 	trans = bch2_trans_get(c);
1955 
1956 	for_each_member_device(ca, c, i) {
1957 		struct bucket_gens *gens;
1958 
1959 		BUG_ON(ca->oldest_gen);
1960 
1961 		ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
1962 		if (!ca->oldest_gen) {
1963 			percpu_ref_put(&ca->ref);
1964 			ret = -BCH_ERR_ENOMEM_gc_gens;
1965 			goto err;
1966 		}
1967 
1968 		gens = bucket_gens(ca);
1969 
1970 		for (b = gens->first_bucket;
1971 		     b < gens->nbuckets; b++)
1972 			ca->oldest_gen[b] = gens->b[b];
1973 	}
1974 
1975 	for (i = 0; i < BTREE_ID_NR; i++)
1976 		if (btree_type_has_ptrs(i)) {
1977 			c->gc_gens_btree = i;
1978 			c->gc_gens_pos = POS_MIN;
1979 
1980 			ret = for_each_btree_key_commit(trans, iter, i,
1981 					POS_MIN,
1982 					BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1983 					k,
1984 					NULL, NULL,
1985 					BTREE_INSERT_NOFAIL,
1986 				gc_btree_gens_key(trans, &iter, k));
1987 			if (ret && !bch2_err_matches(ret, EROFS))
1988 				bch_err_fn(c, ret);
1989 			if (ret)
1990 				goto err;
1991 		}
1992 
1993 	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1994 			POS_MIN,
1995 			BTREE_ITER_PREFETCH,
1996 			k,
1997 			NULL, NULL,
1998 			BTREE_INSERT_NOFAIL,
1999 		bch2_alloc_write_oldest_gen(trans, &iter, k));
2000 	if (ret && !bch2_err_matches(ret, EROFS))
2001 		bch_err_fn(c, ret);
2002 	if (ret)
2003 		goto err;
2004 
2005 	c->gc_gens_btree	= 0;
2006 	c->gc_gens_pos		= POS_MIN;
2007 
2008 	c->gc_count++;
2009 
2010 	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
2011 	trace_and_count(c, gc_gens_end, c);
2012 err:
2013 	for_each_member_device(ca, c, i) {
2014 		kvfree(ca->oldest_gen);
2015 		ca->oldest_gen = NULL;
2016 	}
2017 
2018 	bch2_trans_put(trans);
2019 	up_read(&c->gc_lock);
2020 	mutex_unlock(&c->gc_gens_lock);
2021 	return ret;
2022 }
2023 
2024 static int bch2_gc_thread(void *arg)
2025 {
2026 	struct bch_fs *c = arg;
2027 	struct io_clock *clock = &c->io_clock[WRITE];
2028 	unsigned long last = atomic64_read(&clock->now);
2029 	unsigned last_kick = atomic_read(&c->kick_gc);
2030 	int ret;
2031 
2032 	set_freezable();
2033 
2034 	while (1) {
2035 		while (1) {
2036 			set_current_state(TASK_INTERRUPTIBLE);
2037 
2038 			if (kthread_should_stop()) {
2039 				__set_current_state(TASK_RUNNING);
2040 				return 0;
2041 			}
2042 
2043 			if (atomic_read(&c->kick_gc) != last_kick)
2044 				break;
2045 
2046 			if (c->btree_gc_periodic) {
2047 				unsigned long next = last + c->capacity / 16;
2048 
2049 				if (atomic64_read(&clock->now) >= next)
2050 					break;
2051 
2052 				bch2_io_clock_schedule_timeout(clock, next);
2053 			} else {
2054 				schedule();
2055 			}
2056 
2057 			try_to_freeze();
2058 		}
2059 		__set_current_state(TASK_RUNNING);
2060 
2061 		last = atomic64_read(&clock->now);
2062 		last_kick = atomic_read(&c->kick_gc);
2063 
2064 		/*
2065 		 * Full gc is currently incompatible with btree key cache:
2066 		 */
2067 #if 0
2068 		ret = bch2_gc(c, false, false);
2069 #else
2070 		ret = bch2_gc_gens(c);
2071 #endif
2072 		if (ret < 0)
2073 			bch_err_fn(c, ret);
2074 
2075 		debug_check_no_locks_held();
2076 	}
2077 
2078 	return 0;
2079 }
2080 
2081 void bch2_gc_thread_stop(struct bch_fs *c)
2082 {
2083 	struct task_struct *p;
2084 
2085 	p = c->gc_thread;
2086 	c->gc_thread = NULL;
2087 
2088 	if (p) {
2089 		kthread_stop(p);
2090 		put_task_struct(p);
2091 	}
2092 }
2093 
2094 int bch2_gc_thread_start(struct bch_fs *c)
2095 {
2096 	struct task_struct *p;
2097 
2098 	if (c->gc_thread)
2099 		return 0;
2100 
2101 	p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
2102 	if (IS_ERR(p)) {
2103 		bch_err_fn(c, PTR_ERR(p));
2104 		return PTR_ERR(p);
2105 	}
2106 
2107 	get_task_struct(p);
2108 	c->gc_thread = p;
2109 	wake_up_process(p);
2110 	return 0;
2111 }
2112