1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4 * Copyright (C) 2014 Datera Inc.
5 */
6
7 #include "bcachefs.h"
8 #include "alloc_background.h"
9 #include "alloc_foreground.h"
10 #include "backpointers.h"
11 #include "bkey_methods.h"
12 #include "bkey_buf.h"
13 #include "btree_journal_iter.h"
14 #include "btree_key_cache.h"
15 #include "btree_locking.h"
16 #include "btree_node_scan.h"
17 #include "btree_update_interior.h"
18 #include "btree_io.h"
19 #include "btree_gc.h"
20 #include "buckets.h"
21 #include "clock.h"
22 #include "debug.h"
23 #include "disk_accounting.h"
24 #include "ec.h"
25 #include "enumerated_ref.h"
26 #include "error.h"
27 #include "extents.h"
28 #include "journal.h"
29 #include "keylist.h"
30 #include "move.h"
31 #include "progress.h"
32 #include "recovery_passes.h"
33 #include "reflink.h"
34 #include "recovery.h"
35 #include "replicas.h"
36 #include "super-io.h"
37 #include "trace.h"
38
39 #include <linux/slab.h>
40 #include <linux/bitops.h>
41 #include <linux/freezer.h>
42 #include <linux/kthread.h>
43 #include <linux/preempt.h>
44 #include <linux/rcupdate.h>
45 #include <linux/sched/task.h>
46
47 #define DROP_THIS_NODE 10
48 #define DROP_PREV_NODE 11
49 #define DID_FILL_FROM_SCAN 12
50
/*
 * Returns false for btrees we can easily reconstruct, or that otherwise won't
 * cause data loss if they're missing; true for btrees whose loss matters:
 */
static bool btree_id_important(enum btree_id btree)
56 {
57 if (btree_id_is_alloc(btree))
58 return false;
59
60 switch (btree) {
61 case BTREE_ID_quotas:
62 case BTREE_ID_snapshot_trees:
63 case BTREE_ID_logged_ops:
64 case BTREE_ID_rebalance_work:
65 case BTREE_ID_subvolume_children:
66 return false;
67 default:
68 return true;
69 }
70 }
71
72 static const char * const bch2_gc_phase_strs[] = {
73 #define x(n) #n,
74 GC_PHASES()
75 #undef x
76 NULL
77 };
78
void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
80 {
81 prt_str(out, bch2_gc_phase_strs[p->phase]);
82 prt_char(out, ' ');
83 bch2_btree_id_level_to_text(out, p->btree, p->level);
84 prt_char(out, ' ');
85 bch2_bpos_to_text(out, p->pos);
86 }
87
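/*
 * Strip the const from a bkey_s_c to get a bkey_s; "unsafe" because the
 * caller must not actually modify the key. GC uses this to hand existing
 * (read-only) keys to trigger code that takes a bkey_s.
 */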
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
89 {
90 return (struct bkey_s) {{{
91 (struct bkey *) k.k,
92 (struct bch_val *) k.v
93 }}};
94 }
95
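/*
 * Update the current GC position under gc_pos_lock so readers see a
 * consistent snapshot; gc_pos_set() additionally asserts that GC only ever
 * moves forward in the GC ordering.
 */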
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
97 {
98 preempt_disable();
99 write_seqcount_begin(&c->gc_pos_lock);
100 c->gc_pos = new_pos;
101 write_seqcount_end(&c->gc_pos_lock);
102 preempt_enable();
103 }
104
static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
106 {
107 BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
108 __gc_pos_set(c, new_pos);
109 }
110
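/*
 * Copy a node's key into btree_ptr_v2 format, upgrading from
 * KEY_TYPE_btree_ptr if necessary, so repair code can adjust min_key and set
 * BTREE_PTR_RANGE_UPDATED.
 */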
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
112 {
113 switch (b->key.k.type) {
114 case KEY_TYPE_btree_ptr: {
115 struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
116
117 dst->k.p = src->k.p;
118 dst->v.mem_ptr = 0;
119 dst->v.seq = b->data->keys.seq;
120 dst->v.sectors_written = 0;
121 dst->v.flags = 0;
122 dst->v.min_key = b->data->min_key;
123 set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
124 memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
125 break;
126 }
127 case KEY_TYPE_btree_ptr_v2:
128 bkey_copy(&dst->k_i, &b->key);
129 break;
130 default:
131 BUG();
132 }
133 }
134
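/*
 * Topology repair: rewrite a node's min_key, updating both the in-memory
 * node and its pointer in the parent (via a journal key insert), then drop
 * any keys that now fall outside the node.
 */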
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
136 {
137 struct bkey_i_btree_ptr_v2 *new;
138 int ret;
139
140 if (c->opts.verbose) {
141 struct printbuf buf = PRINTBUF;
142
143 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
144 prt_str(&buf, " -> ");
145 bch2_bpos_to_text(&buf, new_min);
146
147 bch_info(c, "%s(): %s", __func__, buf.buf);
148 printbuf_exit(&buf);
149 }
150
151 new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
152 if (!new)
153 return bch_err_throw(c, ENOMEM_gc_repair_key);
154
155 btree_ptr_to_v2(b, new);
156 b->data->min_key = new_min;
157 new->v.min_key = new_min;
158 SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
159
160 ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
161 if (ret) {
162 kfree(new);
163 return ret;
164 }
165
166 bch2_btree_node_drop_keys_outside_node(b);
167 bkey_copy(&b->key, &new->k_i);
168 return 0;
169 }
170
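/*
 * Topology repair: rewrite a node's max_key. Since the node's key (its
 * position in the parent) changes, the old journal key is deleted and the
 * node is rehashed in the btree node cache under the new key.
 */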
static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
172 {
173 struct bkey_i_btree_ptr_v2 *new;
174 int ret;
175
176 if (c->opts.verbose) {
177 struct printbuf buf = PRINTBUF;
178
179 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
180 prt_str(&buf, " -> ");
181 bch2_bpos_to_text(&buf, new_max);
182
183 bch_info(c, "%s(): %s", __func__, buf.buf);
184 printbuf_exit(&buf);
185 }
186
187 ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
188 if (ret)
189 return ret;
190
191 new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
192 if (!new)
193 return bch_err_throw(c, ENOMEM_gc_repair_key);
194
195 btree_ptr_to_v2(b, new);
196 b->data->max_key = new_max;
197 new->k.p = new_max;
198 SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
199
200 ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
201 if (ret) {
202 kfree(new);
203 return ret;
204 }
205
206 bch2_btree_node_drop_keys_outside_node(b);
207
208 mutex_lock(&c->btree_cache.lock);
209 __bch2_btree_node_hash_remove(&c->btree_cache, b);
210
211 bkey_copy(&b->key, &new->k_i);
212 ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
213 BUG_ON(ret);
214 mutex_unlock(&c->btree_cache.lock);
215 return 0;
216 }
217
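/*
 * Check that @cur starts where the previous child ended (or at the parent's
 * min_key): gaps may be filled from nodes found by btree node scan, or fixed
 * by rewriting min_key; overlapping nodes are trimmed or dropped depending
 * on which node is newer.
 */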
static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
				       struct btree *prev, struct btree *cur,
				       struct bpos *pulled_from_scan)
221 {
222 struct bch_fs *c = trans->c;
223 struct bpos expected_start = !prev
224 ? b->data->min_key
225 : bpos_successor(prev->key.k.p);
226 struct printbuf buf = PRINTBUF;
227 int ret = 0;
228
229 BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
230 !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
231 b->data->min_key));
232
233 if (bpos_eq(expected_start, cur->data->min_key))
234 return 0;
235
236 prt_printf(&buf, " at ");
237 bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
238 prt_printf(&buf, ":\nparent: ");
239 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
240
241 if (prev) {
242 prt_printf(&buf, "\nprev: ");
243 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
244 }
245
246 prt_str(&buf, "\nnext: ");
247 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
248
249 if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
250 if (b->c.level == 1 &&
251 bpos_lt(*pulled_from_scan, cur->data->min_key)) {
252 ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
253 expected_start,
254 bpos_predecessor(cur->data->min_key));
255 if (ret)
256 goto err;
257
258 *pulled_from_scan = cur->data->min_key;
259 ret = DID_FILL_FROM_SCAN;
260 } else {
261 if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
262 "btree node with incorrect min_key%s", buf.buf))
263 ret = set_node_min(c, cur, expected_start);
264 }
265 } else { /* overlap */
266 if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
267 if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
268 if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
269 "btree node overwritten by next node%s", buf.buf))
270 ret = DROP_PREV_NODE;
271 } else {
272 if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
273 "btree node with incorrect max_key%s", buf.buf))
274 ret = set_node_max(c, prev,
275 bpos_predecessor(cur->data->min_key));
276 }
277 } else {
278 if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
279 if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
280 "btree node overwritten by prev node%s", buf.buf))
281 ret = DROP_THIS_NODE;
282 } else {
283 if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
284 "btree node with incorrect min_key%s", buf.buf))
285 ret = set_node_min(c, cur, expected_start);
286 }
287 }
288 }
289 err:
290 fsck_err:
291 printbuf_exit(&buf);
292 return ret;
293 }
294
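/*
 * Check that the last child's max_key matches the parent's: if not, fill the
 * missing range from scanned nodes (at level 1) or extend the child's
 * max_key to match.
 */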
static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
				 struct btree *child, struct bpos *pulled_from_scan)
297 {
298 struct bch_fs *c = trans->c;
299 struct printbuf buf = PRINTBUF;
300 int ret = 0;
301
302 if (bpos_eq(child->key.k.p, b->key.k.p))
303 return 0;
304
305 prt_printf(&buf, "\nat: ");
306 bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
307 prt_printf(&buf, "\nparent: ");
308 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
309
310 prt_str(&buf, "\nchild: ");
311 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
312
313 if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
314 "btree node with incorrect max_key%s", buf.buf)) {
315 if (b->c.level == 1 &&
316 bpos_lt(*pulled_from_scan, b->key.k.p)) {
317 ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
318 bpos_successor(child->key.k.p), b->key.k.p);
319 if (ret)
320 goto err;
321
322 *pulled_from_scan = b->key.k.p;
323 ret = DID_FILL_FROM_SCAN;
324 } else {
325 ret = set_node_max(c, child, b->key.k.p);
326 }
327 }
328 err:
329 fsck_err:
330 printbuf_exit(&buf);
331 return ret;
332 }
333
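/*
 * Walk an interior node's children, repairing boundaries and evicting
 * unreadable or stale nodes, then recurse into each surviving child;
 * returns DROP_THIS_NODE if the node ends up with no children.
 */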
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
					      struct bpos *pulled_from_scan)
336 {
337 struct bch_fs *c = trans->c;
338 struct btree_and_journal_iter iter;
339 struct bkey_s_c k;
340 struct bkey_buf prev_k, cur_k;
341 struct btree *prev = NULL, *cur = NULL;
342 bool have_child, new_pass = false;
343 struct printbuf buf = PRINTBUF;
344 int ret = 0;
345
346 if (!b->c.level)
347 return 0;
348
349 bch2_bkey_buf_init(&prev_k);
350 bch2_bkey_buf_init(&cur_k);
351 again:
352 cur = prev = NULL;
353 have_child = new_pass = false;
354 bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
355 iter.prefetch = true;
356
357 while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
358 BUG_ON(bpos_lt(k.k->p, b->data->min_key));
359 BUG_ON(bpos_gt(k.k->p, b->data->max_key));
360
361 bch2_btree_and_journal_iter_advance(&iter);
362 bch2_bkey_buf_reassemble(&cur_k, c, k);
363
364 cur = bch2_btree_node_get_noiter(trans, cur_k.k,
365 b->c.btree_id, b->c.level - 1,
366 false);
367 ret = PTR_ERR_OR_ZERO(cur);
368
369 printbuf_reset(&buf);
370 bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
371 prt_char(&buf, ' ');
372 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
373
374 if (bch2_err_matches(ret, EIO)) {
375 bch2_btree_node_evict(trans, cur_k.k);
376 cur = NULL;
377 ret = bch2_journal_key_delete(c, b->c.btree_id,
378 b->c.level, cur_k.k->k.p);
379 if (ret)
380 break;
381 continue;
382 }
383
384 bch_err_msg(c, ret, "getting btree node");
385 if (ret)
386 break;
387
388 if (bch2_btree_node_is_stale(c, cur)) {
389 bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
390 six_unlock_read(&cur->c.lock);
391 bch2_btree_node_evict(trans, cur_k.k);
392 ret = bch2_journal_key_delete(c, b->c.btree_id,
393 b->c.level, cur_k.k->k.p);
394 cur = NULL;
395 if (ret)
396 break;
397 continue;
398 }
399
400 ret = lockrestart_do(trans,
401 btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan));
402 if (ret < 0)
403 goto err;
404
405 if (ret == DID_FILL_FROM_SCAN) {
406 new_pass = true;
407 ret = 0;
408 }
409
410 if (ret == DROP_THIS_NODE) {
411 six_unlock_read(&cur->c.lock);
412 bch2_btree_node_evict(trans, cur_k.k);
413 ret = bch2_journal_key_delete(c, b->c.btree_id,
414 b->c.level, cur_k.k->k.p);
415 cur = NULL;
416 if (ret)
417 break;
418 continue;
419 }
420
421 if (prev)
422 six_unlock_read(&prev->c.lock);
423 prev = NULL;
424
425 if (ret == DROP_PREV_NODE) {
426 bch_info(c, "dropped prev node");
427 bch2_btree_node_evict(trans, prev_k.k);
428 ret = bch2_journal_key_delete(c, b->c.btree_id,
429 b->c.level, prev_k.k->k.p);
430 if (ret)
431 break;
432
433 bch2_btree_and_journal_iter_exit(&iter);
434 goto again;
435 } else if (ret)
436 break;
437
438 prev = cur;
439 cur = NULL;
440 bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
441 }
442
443 if (!ret && !IS_ERR_OR_NULL(prev)) {
444 BUG_ON(cur);
445 ret = lockrestart_do(trans,
446 btree_repair_node_end(trans, b, prev, pulled_from_scan));
447 if (ret == DID_FILL_FROM_SCAN) {
448 new_pass = true;
449 ret = 0;
450 }
451 }
452
453 if (!IS_ERR_OR_NULL(prev))
454 six_unlock_read(&prev->c.lock);
455 prev = NULL;
456 if (!IS_ERR_OR_NULL(cur))
457 six_unlock_read(&cur->c.lock);
458 cur = NULL;
459
460 if (ret)
461 goto err;
462
463 bch2_btree_and_journal_iter_exit(&iter);
464
465 if (new_pass)
466 goto again;
467
468 bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
469 iter.prefetch = true;
470
471 while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
472 bch2_bkey_buf_reassemble(&cur_k, c, k);
473 bch2_btree_and_journal_iter_advance(&iter);
474
475 cur = bch2_btree_node_get_noiter(trans, cur_k.k,
476 b->c.btree_id, b->c.level - 1,
477 false);
478 ret = PTR_ERR_OR_ZERO(cur);
479
480 bch_err_msg(c, ret, "getting btree node");
481 if (ret)
482 goto err;
483
484 ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
485 six_unlock_read(&cur->c.lock);
486 cur = NULL;
487
488 if (ret == DROP_THIS_NODE) {
489 bch2_btree_node_evict(trans, cur_k.k);
490 ret = bch2_journal_key_delete(c, b->c.btree_id,
491 b->c.level, cur_k.k->k.p);
492 new_pass = true;
493 }
494
495 if (ret)
496 goto err;
497
498 have_child = true;
499 }
500
501 printbuf_reset(&buf);
502 bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
503 prt_newline(&buf);
504 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
505
506 /*
507 * XXX: we're not passing the trans object here because we're not set up
508 * to handle a transaction restart - this code needs to be rewritten
509 * when we start doing online topology repair
510 */
511 bch2_trans_unlock_long(trans);
512 if (mustfix_fsck_err_on(!have_child,
513 c, btree_node_topology_interior_node_empty,
514 "empty interior btree node at %s", buf.buf))
515 ret = DROP_THIS_NODE;
516 err:
517 fsck_err:
518 if (!IS_ERR_OR_NULL(prev))
519 six_unlock_read(&prev->c.lock);
520 if (!IS_ERR_OR_NULL(cur))
521 six_unlock_read(&cur->c.lock);
522
523 bch2_btree_and_journal_iter_exit(&iter);
524
525 if (!ret && new_pass)
526 goto again;
527
528 BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
529
530 bch2_bkey_buf_exit(&prev_k, c);
531 bch2_bkey_buf_exit(&cur_k, c);
532 printbuf_exit(&buf);
533 bch_err_fn(c, ret);
534 return ret;
535 }
536
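/*
 * Check a btree root: if it's unreadable, either reconstruct the btree from
 * nodes found by scanning, or replace it with an empty fake root if nothing
 * was found.
 */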
static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
			   bool *reconstructed_root)
539 {
540 struct bch_fs *c = trans->c;
541 struct btree_root *r = bch2_btree_id_root(c, btree);
542 struct printbuf buf = PRINTBUF;
543 int ret = 0;
544
545 bch2_btree_id_to_text(&buf, btree);
546
547 if (r->error) {
548 bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
549
550 ret = bch2_btree_has_scanned_nodes(c, btree);
551 if (ret < 0)
552 goto err;
553
554 if (!ret) {
555 __fsck_err(trans,
556 FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
557 btree_root_unreadable_and_scan_found_nothing,
558 "no nodes found for btree %s, continue?", buf.buf);
559
560 r->alive = false;
561 r->error = 0;
562 bch2_btree_root_alloc_fake_trans(trans, btree, 0);
563 } else {
564 r->alive = false;
565 r->error = 0;
566 bch2_btree_root_alloc_fake_trans(trans, btree, 1);
567
568 bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
569 ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
570 if (ret)
571 goto err;
572 }
573
574 *reconstructed_root = true;
575 }
576 err:
577 fsck_err:
578 printbuf_exit(&buf);
579 bch_err_fn(c, ret);
580 return ret;
581 }
582
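/* Topology repair entry point: check every btree root, then repair below it */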
int bch2_check_topology(struct bch_fs *c)
584 {
585 struct btree_trans *trans = bch2_trans_get(c);
586 struct bpos pulled_from_scan = POS_MIN;
587 int ret = 0;
588
589 bch2_trans_srcu_unlock(trans);
590
591 for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
592 bool reconstructed_root = false;
593 recover:
594 ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
595 if (ret)
596 break;
597
598 struct btree_root *r = bch2_btree_id_root(c, i);
599 struct btree *b = r->b;
600
601 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
602 ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
603 six_unlock_read(&b->c.lock);
604
605 if (ret == DROP_THIS_NODE) {
606 mutex_lock(&c->btree_cache.lock);
607 bch2_btree_node_hash_remove(&c->btree_cache, b);
608 mutex_unlock(&c->btree_cache.lock);
609
610 r->b = NULL;
611
612 if (!reconstructed_root) {
613 r->error = -EIO;
614 goto recover;
615 }
616
617 struct printbuf buf = PRINTBUF;
618 bch2_btree_id_to_text(&buf, i);
619 bch_err(c, "empty btree root %s", buf.buf);
620 printbuf_exit(&buf);
621 bch2_btree_root_alloc_fake_trans(trans, i, 0);
622 r->alive = false;
623 ret = 0;
624 }
625 }
626
627 bch2_trans_put(trans);
628 return ret;
629 }
630
631 /* marking of btree keys/nodes: */
632
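/*
 * Mark a single key for GC: run the check_repair trigger and commit any
 * resulting repairs, then run the GC trigger to recompute usage for this key.
 */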
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
			    unsigned level, struct btree **prev,
			    struct btree_iter *iter, struct bkey_s_c k,
			    bool initial)
637 {
638 struct bch_fs *c = trans->c;
639
640 if (iter) {
641 struct btree_path *path = btree_iter_path(trans, iter);
642 struct btree *b = path_l(path)->b;
643
644 if (*prev != b) {
645 int ret = bch2_btree_node_check_topology(trans, b);
646 if (ret)
647 return ret;
648 }
649 *prev = b;
650 }
651
652 struct bkey deleted = KEY(0, 0, 0);
653 struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
654 struct printbuf buf = PRINTBUF;
655 int ret = 0;
656
657 deleted.p = k.k->p;
658
659 if (initial) {
660 BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
661 k.k->bversion.lo > atomic64_read(&c->journal.seq));
662
663 if (fsck_err_on(btree_id != BTREE_ID_accounting &&
664 k.k->bversion.lo > atomic64_read(&c->key_version),
665 trans, bkey_version_in_future,
666 "key version number higher than recorded %llu\n%s",
667 atomic64_read(&c->key_version),
668 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
669 atomic64_set(&c->key_version, k.k->bversion.lo);
670 }
671
672 if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
673 trans, btree_bitmap_not_marked,
674 "btree ptr not marked in member info btree allocated bitmap\n%s",
675 (printbuf_reset(&buf),
676 bch2_bkey_val_to_text(&buf, c, k),
677 buf.buf))) {
678 mutex_lock(&c->sb_lock);
679 bch2_dev_btree_bitmap_mark(c, k);
680 bch2_write_super(c);
681 mutex_unlock(&c->sb_lock);
682 }
683
/*
 * We require a commit before key_trigger() because
 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
 * wrong result if we run it multiple times.
 */
689 unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
690
691 ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
692 BTREE_TRIGGER_check_repair|flags);
693 if (ret)
694 goto out;
695
696 if (trans->nr_updates) {
697 ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
698 -BCH_ERR_transaction_restart_nested;
699 goto out;
700 }
701
702 ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
703 BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
704 out:
705 fsck_err:
706 printbuf_exit(&buf);
707 bch_err_fn(c, ret);
708 return ret;
709 }
710
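/*
 * Walk one btree for GC, marking keys level by level from the lowest level
 * that has triggers (level 0 for the initial pass) up to and including the
 * root's key.
 */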
static int bch2_gc_btree(struct btree_trans *trans,
			 struct progress_indicator_state *progress,
			 enum btree_id btree, bool initial)
714 {
715 struct bch_fs *c = trans->c;
716 unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
717 int ret = 0;
718
719 /* We need to make sure every leaf node is readable before going RW */
720 if (initial)
721 target_depth = 0;
722
723 for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
724 struct btree *prev = NULL;
725 struct btree_iter iter;
726 bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
727 BTREE_ITER_prefetch);
728
729 ret = for_each_btree_key_continue(trans, iter, 0, k, ({
730 bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
731 gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
732 bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
733 }));
734 if (ret)
735 goto err;
736 }
737
738 /* root */
739 do {
740 retry_root:
741 bch2_trans_begin(trans);
742
743 struct btree_iter iter;
744 bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
745 0, bch2_btree_id_root(c, btree)->b->c.level, 0);
746 struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
747 ret = PTR_ERR_OR_ZERO(b);
748 if (ret)
749 goto err_root;
750
751 if (b != btree_node_root(c, b)) {
752 bch2_trans_iter_exit(trans, &iter);
753 goto retry_root;
754 }
755
756 gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
757 struct bkey_s_c k = bkey_i_to_s_c(&b->key);
758 ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
759 err_root:
760 bch2_trans_iter_exit(trans, &iter);
761 } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
762 err:
763 bch_err_fn(c, ret);
764 return ret;
765 }
766
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
768 {
769 return cmp_int(gc_btree_order(l), gc_btree_order(r));
770 }
771
static int bch2_gc_btrees(struct bch_fs *c)
773 {
774 struct btree_trans *trans = bch2_trans_get(c);
775 struct printbuf buf = PRINTBUF;
776 int ret = 0;
777
778 struct progress_indicator_state progress;
779 bch2_progress_init(&progress, c, ~0ULL);
780
781 enum btree_id ids[BTREE_ID_NR];
782 for (unsigned i = 0; i < BTREE_ID_NR; i++)
783 ids[i] = i;
784 bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
785
786 for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
787 unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
788
789 if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
790 continue;
791
792 ret = bch2_gc_btree(trans, &progress, btree, true);
793 }
794
795 printbuf_exit(&buf);
796 bch2_trans_put(trans);
797 bch_err_fn(c, ret);
798 return ret;
799 }
800
static int bch2_mark_superblocks(struct bch_fs *c)
802 {
803 gc_pos_set(c, gc_phase(GC_PHASE_sb));
804
805 return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
806 }
807
static void bch2_gc_free(struct bch_fs *c)
809 {
810 bch2_accounting_gc_free(c);
811
812 genradix_free(&c->reflink_gc_table);
813 genradix_free(&c->gc_stripes);
814
815 for_each_member_device(c, ca)
816 genradix_free(&ca->buckets_gc);
817 }
818
static int bch2_gc_start(struct bch_fs *c)
820 {
821 for_each_member_device(c, ca) {
822 int ret = bch2_dev_usage_init(ca, true);
823 if (ret) {
824 bch2_dev_put(ca);
825 return ret;
826 }
827 }
828
829 return 0;
830 }
831
832 /* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
				     struct bch_alloc_v4 r)
835 {
836 return l.gen != r.gen ||
837 l.oldest_gen != r.oldest_gen ||
838 l.data_type != r.data_type ||
839 l.dirty_sectors != r.dirty_sectors ||
840 l.stripe_sectors != r.stripe_sectors ||
841 l.cached_sectors != r.cached_sectors ||
842 l.stripe_redundancy != r.stripe_redundancy ||
843 l.stripe != r.stripe;
844 }
845
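/*
 * Compare an alloc key against the in-memory GC bucket and, with
 * confirmation via fsck_err, rewrite any fields that disagree.
 */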
static int bch2_alloc_write_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bch_dev *ca,
				struct bkey_s_c k)
850 {
851 struct bch_fs *c = trans->c;
852 struct bkey_i_alloc_v4 *a;
853 struct bch_alloc_v4 old_gc, gc, old_convert, new;
854 const struct bch_alloc_v4 *old;
855 int ret;
856
857 if (!bucket_valid(ca, k.k->p.offset))
858 return 0;
859
860 old = bch2_alloc_to_v4(k, &old_convert);
861 gc = new = *old;
862
863 __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
864
865 old_gc = gc;
866
867 if ((old->data_type == BCH_DATA_sb ||
868 old->data_type == BCH_DATA_journal) &&
869 !bch2_dev_is_online(ca)) {
870 gc.data_type = old->data_type;
871 gc.dirty_sectors = old->dirty_sectors;
872 }
873
874 /*
875 * gc.data_type doesn't yet include need_discard & need_gc_gen states -
876 * fix that here:
877 */
878 alloc_data_type_set(&gc, gc.data_type);
879 if (gc.data_type != old_gc.data_type ||
880 gc.dirty_sectors != old_gc.dirty_sectors) {
881 ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
882 if (ret)
883 return ret;
884
885 /*
886 * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
887 * safe w.r.t. transaction restarts, so fixup the gc_bucket so
888 * we don't run it twice:
889 */
890 struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
891 gc_m->data_type = gc.data_type;
892 gc_m->dirty_sectors = gc.dirty_sectors;
893 }
894
895 if (fsck_err_on(new.data_type != gc.data_type,
896 trans, alloc_key_data_type_wrong,
897 "bucket %llu:%llu gen %u has wrong data_type"
898 ": got %s, should be %s",
899 iter->pos.inode, iter->pos.offset,
900 gc.gen,
901 bch2_data_type_str(new.data_type),
902 bch2_data_type_str(gc.data_type)))
903 new.data_type = gc.data_type;
904
905 #define copy_bucket_field(_errtype, _f) \
906 if (fsck_err_on(new._f != gc._f, \
907 trans, _errtype, \
908 "bucket %llu:%llu gen %u data type %s has wrong " #_f \
909 ": got %llu, should be %llu", \
910 iter->pos.inode, iter->pos.offset, \
911 gc.gen, \
912 bch2_data_type_str(gc.data_type), \
913 (u64) new._f, (u64) gc._f)) \
914 new._f = gc._f; \
915
916 copy_bucket_field(alloc_key_gen_wrong, gen);
917 copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
918 copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors);
919 copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
920 copy_bucket_field(alloc_key_stripe_wrong, stripe);
921 copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
922 #undef copy_bucket_field
923
924 if (!bch2_alloc_v4_cmp(*old, new))
925 return 0;
926
927 a = bch2_alloc_to_v4_mut(trans, k);
928 ret = PTR_ERR_OR_ZERO(a);
929 if (ret)
930 return ret;
931
932 a->v = new;
933
934 /*
935 * The trigger normally makes sure these are set, but we're not running
936 * triggers:
937 */
938 if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
939 a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
940
941 ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
942 fsck_err:
943 return ret;
944 }
945
static int bch2_gc_alloc_done(struct bch_fs *c)
947 {
948 int ret = 0;
949
950 for_each_member_device(c, ca) {
951 ret = bch2_trans_run(c,
952 for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
953 POS(ca->dev_idx, ca->mi.first_bucket),
954 POS(ca->dev_idx, ca->mi.nbuckets - 1),
955 BTREE_ITER_slots|BTREE_ITER_prefetch, k,
956 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
957 bch2_alloc_write_key(trans, &iter, ca, k)));
958 if (ret) {
959 bch2_dev_put(ca);
960 break;
961 }
962 }
963
964 bch_err_fn(c, ret);
965 return ret;
966 }
967
static int bch2_gc_alloc_start(struct bch_fs *c)
969 {
970 int ret = 0;
971
972 for_each_member_device(c, ca) {
973 ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
974 if (ret) {
975 bch2_dev_put(ca);
976 ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
977 break;
978 }
979 }
980
981 bch_err_fn(c, ret);
982 return ret;
983 }
984
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k)
988 {
989 struct bch_fs *c = trans->c;
990 struct printbuf buf = PRINTBUF;
991 const struct bch_stripe *s;
992 struct gc_stripe *m;
993 bool bad = false;
994 unsigned i;
995 int ret = 0;
996
997 if (k.k->type != KEY_TYPE_stripe)
998 return 0;
999
1000 s = bkey_s_c_to_stripe(k).v;
1001 m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
1002
1003 for (i = 0; i < s->nr_blocks; i++) {
1004 u32 old = stripe_blockcount_get(s, i);
1005 u32 new = (m ? m->block_sectors[i] : 0);
1006
1007 if (old != new) {
1008 prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
1009 i, old, new);
1010 bad = true;
1011 }
1012 }
1013
1014 if (bad)
1015 bch2_bkey_val_to_text(&buf, c, k);
1016
1017 if (fsck_err_on(bad,
1018 trans, stripe_sector_count_wrong,
1019 "%s", buf.buf)) {
1020 struct bkey_i_stripe *new;
1021
1022 new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1023 ret = PTR_ERR_OR_ZERO(new);
1024 if (ret)
1025 return ret;
1026
1027 bkey_reassemble(&new->k_i, k);
1028
1029 for (i = 0; i < new->v.nr_blocks; i++)
1030 stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
1031
1032 ret = bch2_trans_update(trans, iter, &new->k_i, 0);
1033 }
1034 fsck_err:
1035 printbuf_exit(&buf);
1036 return ret;
1037 }
1038
static int bch2_gc_stripes_done(struct bch_fs *c)
1040 {
1041 return bch2_trans_run(c,
1042 for_each_btree_key_commit(trans, iter,
1043 BTREE_ID_stripes, POS_MIN,
1044 BTREE_ITER_prefetch, k,
1045 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1046 bch2_gc_write_stripes_key(trans, &iter, k)));
1047 }
1048
1049 /**
1050 * bch2_check_allocations - walk all references to buckets, and recompute them:
1051 *
1052 * @c: filesystem object
1053 *
1054 * Returns: 0 on success, or standard errcode on failure
1055 *
1056 * Order matters here:
1057 * - Concurrent GC relies on the fact that we have a total ordering for
1058 * everything that GC walks - see gc_will_visit_node(),
1059 * gc_will_visit_root()
1060 *
1061 * - also, references move around in the course of index updates and
1062 * various other crap: everything needs to agree on the ordering
1063 * references are allowed to move around in - e.g., we're allowed to
1064 * start with a reference owned by an open_bucket (the allocator) and
1065 * move it to the btree, but not the reverse.
1066 *
1067 * This is necessary to ensure that gc doesn't miss references that
1068 * move around - if references move backwards in the ordering GC
1069 * uses, GC could skip past them
1070 */
int bch2_check_allocations(struct bch_fs *c)
1072 {
1073 int ret;
1074
1075 down_read(&c->state_lock);
1076 down_write(&c->gc_lock);
1077
1078 bch2_btree_interior_updates_flush(c);
1079
1080 ret = bch2_gc_accounting_start(c) ?:
1081 bch2_gc_start(c) ?:
1082 bch2_gc_alloc_start(c) ?:
1083 bch2_gc_reflink_start(c);
1084 if (ret)
1085 goto out;
1086
1087 gc_pos_set(c, gc_phase(GC_PHASE_start));
1088
1089 ret = bch2_mark_superblocks(c);
1090 bch_err_msg(c, ret, "marking superblocks");
1091 if (ret)
1092 goto out;
1093
1094 ret = bch2_gc_btrees(c);
1095 if (ret)
1096 goto out;
1097
1098 c->gc_count++;
1099
1100 ret = bch2_gc_alloc_done(c) ?:
1101 bch2_gc_accounting_done(c) ?:
1102 bch2_gc_stripes_done(c) ?:
1103 bch2_gc_reflink_done(c);
1104 out:
1105 percpu_down_write(&c->mark_lock);
1106 /* Indicates that gc is no longer in progress: */
1107 __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
1108
1109 bch2_gc_free(c);
1110 percpu_up_write(&c->mark_lock);
1111
1112 up_write(&c->gc_lock);
1113 up_read(&c->state_lock);
1114
1115 /*
1116 * At startup, allocations can happen directly instead of via the
1117 * allocator thread - issue wakeup in case they blocked on gc_lock:
1118 */
1119 closure_wake_up(&c->freelist_wait);
1120
1121 if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
1122 bch2_sb_members_clean_deleted(c);
1123
1124 bch_err_fn(c, ret);
1125 return ret;
1126 }
1127
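/*
 * For gen GC: if any pointer in this key is much too stale, rewrite the key
 * so stale cached pointers get dropped; otherwise fold the key's pointers
 * into the per-device oldest_gen table.
 */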
static int gc_btree_gens_key(struct btree_trans *trans,
			     struct btree_iter *iter,
			     struct bkey_s_c k)
1131 {
1132 struct bch_fs *c = trans->c;
1133 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1134
1135 if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
1136 return -EROFS;
1137
1138 bool too_stale = false;
1139 scoped_guard(rcu) {
1140 bkey_for_each_ptr(ptrs, ptr) {
1141 struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
1142 if (!ca)
1143 continue;
1144
1145 too_stale |= dev_ptr_stale(ca, ptr) > 16;
1146 }
1147
1148 if (!too_stale)
1149 bkey_for_each_ptr(ptrs, ptr) {
1150 struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
1151 if (!ca)
1152 continue;
1153
1154 u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
1155 if (gen_after(*gen, ptr->gen))
1156 *gen = ptr->gen;
1157 }
1158 }
1159
1160 if (too_stale) {
1161 struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
1162 int ret = PTR_ERR_OR_ZERO(u);
1163 if (ret)
1164 return ret;
1165
1166 bch2_extent_normalize(c, bkey_i_to_s(u));
1167 }
1168
1169 return 0;
1170 }
1171
static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
				       struct btree_iter *iter, struct bkey_s_c k)
1174 {
1175 struct bch_alloc_v4 a_convert;
1176 const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
1177 struct bkey_i_alloc_v4 *a_mut;
1178 int ret;
1179
1180 if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
1181 return 0;
1182
1183 a_mut = bch2_alloc_to_v4_mut(trans, k);
1184 ret = PTR_ERR_OR_ZERO(a_mut);
1185 if (ret)
1186 return ret;
1187
1188 a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
1189
1190 return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
1191 }
1192
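/*
 * Bucket generation GC: walk every btree with pointers to find the oldest
 * gen still referenced in each bucket, then write the result back to the
 * alloc btree as each bucket's oldest_gen.
 */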
int bch2_gc_gens(struct bch_fs *c)
1194 {
1195 u64 b, start_time = local_clock();
1196 int ret;
1197
1198 if (!mutex_trylock(&c->gc_gens_lock))
1199 return 0;
1200
1201 trace_and_count(c, gc_gens_start, c);
1202
1203 /*
1204 * We have to use trylock here. Otherwise, we would
1205 * introduce a deadlock in the RO path - we take the
1206 * state lock at the start of going RO.
1207 */
1208 if (!down_read_trylock(&c->state_lock)) {
1209 mutex_unlock(&c->gc_gens_lock);
1210 return 0;
1211 }
1212
1213 for_each_member_device(c, ca) {
1214 struct bucket_gens *gens = bucket_gens(ca);
1215
1216 BUG_ON(ca->oldest_gen);
1217
1218 ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
1219 if (!ca->oldest_gen) {
1220 bch2_dev_put(ca);
1221 ret = bch_err_throw(c, ENOMEM_gc_gens);
1222 goto err;
1223 }
1224
1225 for (b = gens->first_bucket;
1226 b < gens->nbuckets; b++)
1227 ca->oldest_gen[b] = gens->b[b];
1228 }
1229
1230 for (unsigned i = 0; i < BTREE_ID_NR; i++)
1231 if (btree_type_has_ptrs(i)) {
1232 c->gc_gens_btree = i;
1233 c->gc_gens_pos = POS_MIN;
1234
1235 ret = bch2_trans_run(c,
1236 for_each_btree_key_commit(trans, iter, i,
1237 POS_MIN,
1238 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
1239 k,
1240 NULL, NULL,
1241 BCH_TRANS_COMMIT_no_enospc,
1242 gc_btree_gens_key(trans, &iter, k)));
1243 if (ret)
1244 goto err;
1245 }
1246
1247 struct bch_dev *ca = NULL;
1248 ret = bch2_trans_run(c,
1249 for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1250 POS_MIN,
1251 BTREE_ITER_prefetch,
1252 k,
1253 NULL, NULL,
1254 BCH_TRANS_COMMIT_no_enospc, ({
1255 ca = bch2_dev_iterate(c, ca, k.k->p.inode);
1256 if (!ca) {
1257 bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
1258 continue;
1259 }
1260 bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
1261 })));
1262 bch2_dev_put(ca);
1263
1264 if (ret)
1265 goto err;
1266
1267 c->gc_gens_btree = 0;
1268 c->gc_gens_pos = POS_MIN;
1269
1270 c->gc_count++;
1271
1272 bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
1273 trace_and_count(c, gc_gens_end, c);
1274 err:
1275 for_each_member_device(c, ca) {
1276 kvfree(ca->oldest_gen);
1277 ca->oldest_gen = NULL;
1278 }
1279
1280 up_read(&c->state_lock);
1281 mutex_unlock(&c->gc_gens_lock);
1282 if (!bch2_err_matches(ret, EROFS))
1283 bch_err_fn(c, ret);
1284 return ret;
1285 }
1286
static void bch2_gc_gens_work(struct work_struct *work)
1288 {
1289 struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
1290 bch2_gc_gens(c);
1291 enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
1292 }
1293
void bch2_gc_gens_async(struct bch_fs *c)
1295 {
1296 if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
1297 !queue_work(c->write_ref_wq, &c->gc_gens_work))
1298 enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
1299 }
1300
void bch2_fs_btree_gc_init_early(struct bch_fs *c)
1302 {
1303 seqcount_init(&c->gc_pos_lock);
1304 INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
1305
1306 init_rwsem(&c->gc_lock);
1307 mutex_init(&c->gc_gens_lock);
1308 }
1309