// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "recovery_passes.h"
#include "reflink.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>

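/*
 * Internal return codes used by the topology repair code below: they tell the
 * caller to drop the current node, drop the previous node, or that a missing
 * range was filled in from nodes found by btree_node_scan and the pass should
 * be restarted.
 */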
#define DROP_THIS_NODE		10
#define DROP_PREV_NODE		11
#define DID_FILL_FROM_SCAN	12

static const char * const bch2_gc_phase_strs[] = {
#define x(n)	#n,
	GC_PHASES()
#undef x
	NULL
};

void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
{
	prt_str(out, bch2_gc_phase_strs[p->phase]);
	prt_char(out, ' ');
	bch2_btree_id_level_to_text(out, p->btree, p->level);
	prt_char(out, ' ');
	bch2_bpos_to_text(out, p->pos);
}

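/*
 * Cast a read-only bkey_s_c to a writeable bkey_s. This strips constness so
 * the key can be passed to trigger code that takes struct bkey_s; the GC paths
 * using it are not expected to modify the key in place.
 */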
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
	return (struct bkey_s) {{{
		(struct bkey *) k.k,
		(struct bch_val *) k.v
	}}};
}

static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	preempt_disable();
	write_seqcount_begin(&c->gc_pos_lock);
	c->gc_pos = new_pos;
	write_seqcount_end(&c->gc_pos_lock);
	preempt_enable();
}

static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
	__gc_pos_set(c, new_pos);
}

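/*
 * Convert a btree node's key to the btree_ptr_v2 format, so that fields like
 * min_key can be updated during topology repair.
 */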
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
	switch (b->key.k.type) {
	case KEY_TYPE_btree_ptr: {
		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);

		dst->k.p		= src->k.p;
		dst->v.mem_ptr		= 0;
		dst->v.seq		= b->data->keys.seq;
		dst->v.sectors_written	= 0;
		dst->v.flags		= 0;
		dst->v.min_key		= b->data->min_key;
		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
		break;
	}
	case KEY_TYPE_btree_ptr_v2:
		bkey_copy(&dst->k_i, &b->key);
		break;
	default:
		BUG();
	}
}

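/*
 * Topology repair: update a node's min_key, both in the node itself and in its
 * parent pointer, by inserting an updated btree_ptr_v2 key into the journal
 * keys.
 */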
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, " -> ");
		bch2_bpos_to_text(&buf, new_min);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->min_key	= new_min;
	new->v.min_key		= new_min;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);
	bkey_copy(&b->key, &new->k_i);
	return 0;
}

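/*
 * Topology repair: update a node's max_key. Because the node's key (its
 * position in the btree node hash table) changes, the node must also be
 * rehashed under btree_cache.lock.
 */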
static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, " -> ");
		bch2_bpos_to_text(&buf, new_max);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
	if (ret)
		return ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->max_key	= new_max;
	new->k.p		= new_max;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);

	mutex_lock(&c->btree_cache.lock);
	__bch2_btree_node_hash_remove(&c->btree_cache, b);

	bkey_copy(&b->key, &new->k_i);
	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
	BUG_ON(ret);
	mutex_unlock(&c->btree_cache.lock);
	return 0;
}

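/*
 * Check that @cur starts exactly where the previous child (or the parent's
 * min_key) ended: gaps may be filled in from nodes found by btree_node_scan,
 * while overlaps are resolved by adjusting min/max keys or by asking the
 * caller to drop one of the two nodes (DROP_THIS_NODE/DROP_PREV_NODE).
 */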
static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
				       struct btree *prev, struct btree *cur,
				       struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct bpos expected_start = !prev
		? b->data->min_key
		: bpos_successor(prev->key.k.p);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
			b->data->min_key));

	if (bpos_eq(expected_start, cur->data->min_key))
		return 0;

	prt_printf(&buf, " at ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_printf(&buf, ":\n parent: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (prev) {
		prt_printf(&buf, "\n prev: ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
	}

	prt_str(&buf, "\n next: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));

	if (bpos_lt(expected_start, cur->data->min_key)) {	/* gap */
		if (b->c.level == 1 &&
		    bpos_lt(*pulled_from_scan, cur->data->min_key)) {
			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
						     expected_start,
						     bpos_predecessor(cur->data->min_key));
			if (ret)
				goto err;

			*pulled_from_scan = cur->data->min_key;
			ret = DID_FILL_FROM_SCAN;
		} else {
			if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
					     "btree node with incorrect min_key%s", buf.buf))
				ret = set_node_min(c, cur, expected_start);
		}
	} else {						/* overlap */
		if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
			if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
						     "btree node overwritten by next node%s", buf.buf))
					ret = DROP_PREV_NODE;
			} else {
				if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
						     "btree node with incorrect max_key%s", buf.buf))
					ret = set_node_max(c, prev,
							   bpos_predecessor(cur->data->min_key));
			}
		} else {
			if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
						     "btree node overwritten by prev node%s", buf.buf))
					ret = DROP_THIS_NODE;
			} else {
				if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
						     "btree node with incorrect min_key%s", buf.buf))
					ret = set_node_min(c, cur, expected_start);
			}
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

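/*
 * Check that the last child's max_key matches the parent's: if not, either
 * fill in the missing range from scanned nodes or extend the child's max_key.
 */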
static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
				 struct btree *child, struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (bpos_eq(child->key.k.p, b->key.k.p))
		return 0;

	prt_printf(&buf, " at ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_printf(&buf, ":\n parent: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\n child: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));

	if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
			     "btree node with incorrect max_key%s", buf.buf)) {
		if (b->c.level == 1 &&
		    bpos_lt(*pulled_from_scan, b->key.k.p)) {
			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
						     bpos_successor(child->key.k.p), b->key.k.p);
			if (ret)
				goto err;

			*pulled_from_scan = b->key.k.p;
			ret = DID_FILL_FROM_SCAN;
		} else {
			ret = set_node_max(c, child, b->key.k.p);
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

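/*
 * Repair one interior node: read each child, fix boundary problems between
 * siblings (possibly pulling in nodes found by btree_node_scan), then recurse
 * into the surviving children. Returns DROP_THIS_NODE if the node is left with
 * no readable children.
 */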
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
					      struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf prev_k, cur_k;
	struct btree *prev = NULL, *cur = NULL;
	bool have_child, new_pass = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!b->c.level)
		return 0;

	bch2_bkey_buf_init(&prev_k);
	bch2_bkey_buf_init(&cur_k);
again:
	cur = prev = NULL;
	have_child = new_pass = false;
	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
	iter.prefetch = true;

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		bch2_btree_and_journal_iter_advance(&iter);
		bch2_bkey_buf_reassemble(&cur_k, c, k);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);

		printbuf_reset(&buf);
		bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
		prt_char(&buf, ' ');
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));

		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
					trans, btree_node_read_error,
					"Topology repair: unreadable btree node at\n"
					" %s",
					buf.buf)) {
			bch2_btree_node_evict(trans, cur_k.k);
			cur = NULL;
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			if (ret)
				break;

			ret = bch2_btree_lost_data(c, b->c.btree_id);
			if (ret)
				break;
			continue;
		}

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			break;

		if (bch2_btree_node_is_stale(c, cur)) {
			bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
		if (ret == DID_FILL_FROM_SCAN) {
			new_pass = true;
			ret = 0;
		}

		if (ret == DROP_THIS_NODE) {
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		if (prev)
			six_unlock_read(&prev->c.lock);
		prev = NULL;

		if (ret == DROP_PREV_NODE) {
			bch_info(c, "dropped prev node");
			bch2_btree_node_evict(trans, prev_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, prev_k.k->k.p);
			if (ret)
				break;

			bch2_btree_and_journal_iter_exit(&iter);
			goto again;
		} else if (ret)
			break;

		prev = cur;
		cur = NULL;
		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
	}

	if (!ret && !IS_ERR_OR_NULL(prev)) {
		BUG_ON(cur);
		ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
		if (ret == DID_FILL_FROM_SCAN) {
			new_pass = true;
			ret = 0;
		}
	}

	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	prev = NULL;
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);
	cur = NULL;

	if (ret)
		goto err;

	bch2_btree_and_journal_iter_exit(&iter);

	if (new_pass)
		goto again;

	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
	iter.prefetch = true;

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		bch2_bkey_buf_reassemble(&cur_k, c, k);
		bch2_btree_and_journal_iter_advance(&iter);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			goto err;

		ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
		six_unlock_read(&cur->c.lock);
		cur = NULL;

		if (ret == DROP_THIS_NODE) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			new_pass = true;
		}

		if (ret)
			goto err;

		have_child = true;
	}

	printbuf_reset(&buf);
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (mustfix_fsck_err_on(!have_child,
				trans, btree_node_topology_interior_node_empty,
				"empty interior btree node at %s", buf.buf))
		ret = DROP_THIS_NODE;
err:
fsck_err:
	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);

	bch2_btree_and_journal_iter_exit(&iter);

	if (!ret && new_pass)
		goto again;

	BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));

	bch2_bkey_buf_exit(&prev_k, c);
	bch2_bkey_buf_exit(&cur_k, c);
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_topology(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bpos pulled_from_scan = POS_MIN;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	bch2_trans_srcu_unlock(trans);

	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);
		bool reconstructed_root = false;

		printbuf_reset(&buf);
		bch2_btree_id_to_text(&buf, i);

		if (r->error) {
			ret = bch2_btree_lost_data(c, i);
			if (ret)
				break;
reconstruct_root:
			bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);

			r->alive = false;
			r->error = 0;

			if (!bch2_btree_has_scanned_nodes(c, i)) {
				mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
						 "no nodes found for btree %s, continue?", buf.buf);
				bch2_btree_root_alloc_fake_trans(trans, i, 0);
			} else {
				bch2_btree_root_alloc_fake_trans(trans, i, 1);
				bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
				ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
				if (ret)
					break;
			}

			reconstructed_root = true;
		}

		struct btree *b = r->b;

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
		six_unlock_read(&b->c.lock);

		if (ret == DROP_THIS_NODE) {
			mutex_lock(&c->btree_cache.lock);
			bch2_btree_node_hash_remove(&c->btree_cache, b);
			mutex_unlock(&c->btree_cache.lock);

			r->b = NULL;

			if (!reconstructed_root)
				goto reconstruct_root;

			bch_err(c, "empty btree root %s", buf.buf);
			bch2_btree_root_alloc_fake_trans(trans, i, 0);
			r->alive = false;
			ret = 0;
		}
	}
fsck_err:
	printbuf_exit(&buf);
	bch2_trans_put(trans);
	return ret;
}

/* marking of btree keys/nodes: */

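/*
 * Mark a single key for gc: check/repair its key version and the member info
 * btree allocated bitmap, then run the GC triggers to recompute usage for the
 * buckets it references.
 */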
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
			    unsigned level, struct btree **prev,
			    struct btree_iter *iter, struct bkey_s_c k,
			    bool initial)
{
	struct bch_fs *c = trans->c;

	if (iter) {
		struct btree_path *path = btree_iter_path(trans, iter);
		struct btree *b = path_l(path)->b;

		if (*prev != b) {
			int ret = bch2_btree_node_check_topology(trans, b);
			if (ret)
				return ret;
		}
		*prev = b;
	}

	struct bkey deleted = KEY(0, 0, 0);
	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	deleted.p = k.k->p;

	if (initial) {
		BUG_ON(bch2_journal_seq_verify &&
		       k.k->bversion.lo > atomic64_read(&c->journal.seq));

		if (fsck_err_on(btree_id != BTREE_ID_accounting &&
				k.k->bversion.lo > atomic64_read(&c->key_version),
				trans, bkey_version_in_future,
				"key version number higher than recorded %llu\n %s",
				atomic64_read(&c->key_version),
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			atomic64_set(&c->key_version, k.k->bversion.lo);
	}

	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
				trans, btree_bitmap_not_marked,
				"btree ptr not marked in member info btree allocated bitmap\n %s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
		mutex_lock(&c->sb_lock);
		bch2_dev_btree_bitmap_mark(c, k);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	/*
	 * We require a commit before key_trigger() because
	 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
	 * wrong result if we run it multiple times.
	 */
	unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;

	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
			       BTREE_TRIGGER_check_repair|flags);
	if (ret)
		goto out;

	if (trans->nr_updates) {
		ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
			-BCH_ERR_transaction_restart_nested;
		goto out;
	}

	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
			       BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
out:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

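/*
 * Walk one btree, marking keys level by level and finally the root key. Leaf
 * level is skipped when its key type has no triggers, except on the initial
 * pass, where every leaf must be readable before going RW.
 */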
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
{
	struct bch_fs *c = trans->c;
	unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
	int ret = 0;

	/* We need to make sure every leaf node is readable before going RW */
	if (initial)
		target_depth = 0;

	for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
		struct btree *prev = NULL;
		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
					  BTREE_ITER_prefetch);

		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
			gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
			bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
		}));
		if (ret)
			goto err;
	}

	/* root */
	do {
retry_root:
		bch2_trans_begin(trans);

		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
					  0, bch2_btree_id_root(c, btree)->b->c.level, 0);
		struct btree *b = bch2_btree_iter_peek_node(&iter);
		ret = PTR_ERR_OR_ZERO(b);
		if (ret)
			goto err_root;

		if (b != btree_node_root(c, b)) {
			bch2_trans_iter_exit(trans, &iter);
			goto retry_root;
		}

		gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
		ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
err_root:
		bch2_trans_iter_exit(trans, &iter);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
err:
	bch_err_fn(c, ret);
	return ret;
}

static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return cmp_int(gc_btree_order(l), gc_btree_order(r));
}

static int bch2_gc_btrees(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	enum btree_id ids[BTREE_ID_NR];
	struct printbuf buf = PRINTBUF;
	unsigned i;
	int ret = 0;

	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		unsigned btree = i < BTREE_ID_NR ? ids[i] : i;

		if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
			continue;

		ret = bch2_gc_btree(trans, btree, true);
	}

	printbuf_exit(&buf);
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

static int bch2_mark_superblocks(struct bch_fs *c)
{
	gc_pos_set(c, gc_phase(GC_PHASE_sb));

	return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
}

static void bch2_gc_free(struct bch_fs *c)
{
	bch2_accounting_gc_free(c);

	genradix_free(&c->reflink_gc_table);
	genradix_free(&c->gc_stripes);

	for_each_member_device(c, ca)
		genradix_free(&ca->buckets_gc);
}

static int bch2_gc_start(struct bch_fs *c)
{
	for_each_member_device(c, ca) {
		int ret = bch2_dev_usage_init(ca, true);
		if (ret) {
			bch2_dev_put(ca);
			return ret;
		}
	}

	return 0;
}

/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
				     struct bch_alloc_v4 r)
{
	return  l.gen != r.gen			||
		l.oldest_gen != r.oldest_gen	||
		l.data_type != r.data_type	||
		l.dirty_sectors	!= r.dirty_sectors ||
		l.stripe_sectors != r.stripe_sectors ||
		l.cached_sectors != r.cached_sectors ||
		l.stripe_redundancy != r.stripe_redundancy ||
		l.stripe != r.stripe;
}

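/*
 * Compare the in-memory GC state for a bucket against its alloc btree key and,
 * prompting fsck for each mismatched field, rewrite the alloc key to match
 * what GC computed.
 */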
static int bch2_alloc_write_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bch_dev *ca,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_alloc_v4 *a;
	struct bch_alloc_v4 old_gc, gc, old_convert, new;
	const struct bch_alloc_v4 *old;
	int ret;

	if (!bucket_valid(ca, k.k->p.offset))
		return 0;

	old = bch2_alloc_to_v4(k, &old_convert);
	gc = new = *old;

	__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));

	old_gc = gc;

	if ((old->data_type == BCH_DATA_sb ||
	     old->data_type == BCH_DATA_journal) &&
	    !bch2_dev_is_online(ca)) {
		gc.data_type = old->data_type;
		gc.dirty_sectors = old->dirty_sectors;
	}

	/*
	 * gc.data_type doesn't yet include need_discard & need_gc_gen states -
	 * fix that here:
	 */
	alloc_data_type_set(&gc, gc.data_type);
	if (gc.data_type != old_gc.data_type ||
	    gc.dirty_sectors != old_gc.dirty_sectors) {
		ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
		if (ret)
			return ret;

		/*
		 * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
		 * safe w.r.t. transaction restarts, so fix up the gc_bucket so
		 * we don't run it twice:
		 */
		struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
		gc_m->data_type = gc.data_type;
		gc_m->dirty_sectors = gc.dirty_sectors;
	}

	if (fsck_err_on(new.data_type != gc.data_type,
			trans, alloc_key_data_type_wrong,
			"bucket %llu:%llu gen %u has wrong data_type"
			": got %s, should be %s",
			iter->pos.inode, iter->pos.offset,
			gc.gen,
			bch2_data_type_str(new.data_type),
			bch2_data_type_str(gc.data_type)))
		new.data_type = gc.data_type;

#define copy_bucket_field(_errtype, _f)					\
	if (fsck_err_on(new._f != gc._f,				\
			trans, _errtype,				\
			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
			": got %llu, should be %llu",			\
			iter->pos.inode, iter->pos.offset,		\
			gc.gen,						\
			bch2_data_type_str(gc.data_type),		\
			(u64) new._f, (u64) gc._f))			\
		new._f = gc._f;						\

	copy_bucket_field(alloc_key_gen_wrong,			gen);
	copy_bucket_field(alloc_key_dirty_sectors_wrong,	dirty_sectors);
	copy_bucket_field(alloc_key_stripe_sectors_wrong,	stripe_sectors);
	copy_bucket_field(alloc_key_cached_sectors_wrong,	cached_sectors);
	copy_bucket_field(alloc_key_stripe_wrong,		stripe);
	copy_bucket_field(alloc_key_stripe_redundancy_wrong,	stripe_redundancy);
#undef copy_bucket_field

	if (!bch2_alloc_v4_cmp(*old, new))
		return 0;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	a->v = new;

	/*
	 * The trigger normally makes sure these are set, but we're not running
	 * triggers:
	 */
	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
fsck_err:
	return ret;
}

static int bch2_gc_alloc_done(struct bch_fs *c)
{
	int ret = 0;

	for_each_member_device(c, ca) {
		ret = bch2_trans_run(c,
			for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
					POS(ca->dev_idx, ca->mi.first_bucket),
					POS(ca->dev_idx, ca->mi.nbuckets - 1),
					BTREE_ITER_slots|BTREE_ITER_prefetch, k,
					NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
				bch2_alloc_write_key(trans, &iter, ca, k)));
		if (ret) {
			bch2_dev_put(ca);
			break;
		}
	}

	bch_err_fn(c, ret);
	return ret;
}

static int bch2_gc_alloc_start(struct bch_fs *c)
{
	int ret = 0;

	for_each_member_device(c, ca) {
		ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
		if (ret) {
			bch2_dev_put(ca);
			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
			break;
		}
	}

	bch_err_fn(c, ret);
	return ret;
}

static int bch2_gc_write_stripes_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	const struct bch_stripe *s;
	struct gc_stripe *m;
	bool bad = false;
	unsigned i;
	int ret = 0;

	if (k.k->type != KEY_TYPE_stripe)
		return 0;

	s = bkey_s_c_to_stripe(k).v;
	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);

	for (i = 0; i < s->nr_blocks; i++) {
		u32 old = stripe_blockcount_get(s, i);
		u32 new = (m ? m->block_sectors[i] : 0);

		if (old != new) {
			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
				   i, old, new);
			bad = true;
		}
	}

	if (bad)
		bch2_bkey_val_to_text(&buf, c, k);

	if (fsck_err_on(bad,
			trans, stripe_sector_count_wrong,
			"%s", buf.buf)) {
		struct bkey_i_stripe *new;

		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		bkey_reassemble(&new->k_i, k);

		for (i = 0; i < new->v.nr_blocks; i++)
			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);

		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_stripes_done(struct bch_fs *c)
{
	return bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_stripes, POS_MIN,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_gc_write_stripes_key(trans, &iter, k)));
}

/**
 * bch2_check_allocations - walk all references to buckets, and recompute them:
 *
 * @c:			filesystem object
 *
 * Returns: 0 on success, or standard errcode on failure
 *
 * Order matters here:
 *  - Concurrent GC relies on the fact that we have a total ordering for
 *    everything that GC walks - see  gc_will_visit_node(),
 *    gc_will_visit_root()
 *
 *  - also, references move around in the course of index updates and
 *    various other crap: everything needs to agree on the ordering
 *    references are allowed to move around in - e.g., we're allowed to
 *    start with a reference owned by an open_bucket (the allocator) and
 *    move it to the btree, but not the reverse.
 *
 *    This is necessary to ensure that gc doesn't miss references that
 *    move around - if references move backwards in the ordering GC
 *    uses, GC could skip past them
 */
int bch2_check_allocations(struct bch_fs *c)
{
	int ret;

	lockdep_assert_held(&c->state_lock);

	down_write(&c->gc_lock);

	bch2_btree_interior_updates_flush(c);

	ret   = bch2_gc_accounting_start(c) ?:
		bch2_gc_start(c) ?:
		bch2_gc_alloc_start(c) ?:
		bch2_gc_reflink_start(c);
	if (ret)
		goto out;

	gc_pos_set(c, gc_phase(GC_PHASE_start));

	ret = bch2_mark_superblocks(c);
	bch_err_msg(c, ret, "marking superblocks");
	if (ret)
		goto out;

	ret = bch2_gc_btrees(c);
	if (ret)
		goto out;

	c->gc_count++;

	ret   = bch2_gc_alloc_done(c) ?:
		bch2_gc_accounting_done(c) ?:
		bch2_gc_stripes_done(c) ?:
		bch2_gc_reflink_done(c);
out:
	percpu_down_write(&c->mark_lock);
	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_not_running));

	bch2_gc_free(c);
	percpu_up_write(&c->mark_lock);

	up_write(&c->gc_lock);

	/*
	 * At startup, allocations can happen directly instead of via the
	 * allocator thread - issue wakeup in case they blocked on gc_lock:
	 */
	closure_wake_up(&c->freelist_wait);
	bch_err_fn(c, ret);
	return ret;
}

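/*
 * Gen GC, per key: if any pointer is more than 16 generations out of date,
 * rewrite the key so stale cached pointers are dropped; otherwise update the
 * per-device oldest_gen table with the oldest generation still referenced.
 */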
static int gc_btree_gens_key(struct btree_trans *trans,
			     struct btree_iter *iter,
			     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bkey_i *u;
	int ret;

	if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
		return -EROFS;

	rcu_read_lock();
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ca)
			continue;

		if (dev_ptr_stale(ca, ptr) > 16) {
			rcu_read_unlock();
			goto update;
		}
	}

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ca)
			continue;

		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
		if (gen_after(*gen, ptr->gen))
			*gen = ptr->gen;
	}
	rcu_read_unlock();
	return 0;
update:
	u = bch2_bkey_make_mut(trans, iter, &k, 0);
	ret = PTR_ERR_OR_ZERO(u);
	if (ret)
		return ret;

	bch2_extent_normalize(c, bkey_i_to_s(u));
	return 0;
}

static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
				       struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
	struct bkey_i_alloc_v4 *a_mut;
	int ret;

	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
		return 0;

	a_mut = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a_mut);
	if (ret)
		return ret;

	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];

	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}

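/*
 * Walk every btree that has pointers and recompute, for each bucket, the
 * oldest generation number still referenced, then write the updated oldest_gen
 * values back to the alloc btree.
 */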
int bch2_gc_gens(struct bch_fs *c)
{
	u64 b, start_time = local_clock();
	int ret;

	if (!mutex_trylock(&c->gc_gens_lock))
		return 0;

	trace_and_count(c, gc_gens_start, c);

	/*
	 * We have to use trylock here. Otherwise, we would
	 * introduce a deadlock in the RO path - we take the
	 * state lock at the start of going RO.
	 */
	if (!down_read_trylock(&c->state_lock)) {
		mutex_unlock(&c->gc_gens_lock);
		return 0;
	}

	for_each_member_device(c, ca) {
		struct bucket_gens *gens = bucket_gens(ca);

		BUG_ON(ca->oldest_gen);

		ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
		if (!ca->oldest_gen) {
			bch2_dev_put(ca);
			ret = -BCH_ERR_ENOMEM_gc_gens;
			goto err;
		}

		for (b = gens->first_bucket;
		     b < gens->nbuckets; b++)
			ca->oldest_gen[b] = gens->b[b];
	}

	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		if (btree_type_has_ptrs(i)) {
			c->gc_gens_btree = i;
			c->gc_gens_pos = POS_MIN;

			ret = bch2_trans_run(c,
				for_each_btree_key_commit(trans, iter, i,
						POS_MIN,
						BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
						k,
						NULL, NULL,
						BCH_TRANS_COMMIT_no_enospc,
					gc_btree_gens_key(trans, &iter, k)));
			if (ret)
				goto err;
		}

	struct bch_dev *ca = NULL;
	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS_MIN,
				BTREE_ITER_prefetch,
				k,
				NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}
			bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
		})));
	bch2_dev_put(ca);

	if (ret)
		goto err;

	c->gc_gens_btree	= 0;
	c->gc_gens_pos		= POS_MIN;

	c->gc_count++;

	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
	trace_and_count(c, gc_gens_end, c);
err:
	for_each_member_device(c, ca) {
		kvfree(ca->oldest_gen);
		ca->oldest_gen = NULL;
	}

	up_read(&c->state_lock);
	mutex_unlock(&c->gc_gens_lock);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

static void bch2_gc_gens_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
	bch2_gc_gens(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}

void bch2_gc_gens_async(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
	    !queue_work(c->write_ref_wq, &c->gc_gens_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}

void bch2_fs_btree_gc_exit(struct bch_fs *c)
{
}

int bch2_fs_btree_gc_init(struct bch_fs *c)
{
	seqcount_init(&c->gc_pos_lock);
	INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);

	init_rwsem(&c->gc_lock);
	mutex_init(&c->gc_gens_lock);
	return 0;
}