// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "recovery_passes.h"
#include "reflink.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>

#define DROP_THIS_NODE		10
#define DROP_PREV_NODE		11
#define DID_FILL_FROM_SCAN	12

static const char * const bch2_gc_phase_strs[] = {
#define x(n)	#n,
	GC_PHASES()
#undef x
	NULL
};

void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
{
	prt_str(out, bch2_gc_phase_strs[p->phase]);
	prt_char(out, ' ');
	bch2_btree_id_level_to_text(out, p->btree, p->level);
	prt_char(out, ' ');
	bch2_bpos_to_text(out, p->pos);
}

static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
	return (struct bkey_s) {{{
		(struct bkey *) k.k,
		(struct bch_val *) k.v
	}}};
}

static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	preempt_disable();
	write_seqcount_begin(&c->gc_pos_lock);
	c->gc_pos = new_pos;
	write_seqcount_end(&c->gc_pos_lock);
	preempt_enable();
}

static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
	__gc_pos_set(c, new_pos);
}

static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
	switch (b->key.k.type) {
	case KEY_TYPE_btree_ptr: {
		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);

		dst->k.p = src->k.p;
		dst->v.mem_ptr = 0;
		dst->v.seq = b->data->keys.seq;
		dst->v.sectors_written = 0;
		dst->v.flags = 0;
		dst->v.min_key = b->data->min_key;
		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
		break;
	}
	case KEY_TYPE_btree_ptr_v2:
		bkey_copy(&dst->k_i, &b->key);
		break;
	default:
		BUG();
	}
}

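/*
 * Topology repair helpers: rewrite a btree node's key with a corrected
 * min_key or max_key. The key is converted to btree_ptr_v2 so that
 * BTREE_PTR_RANGE_UPDATED can be set, the update is inserted into the
 * journal keys, and bkeys now outside the node's range are dropped.
 */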
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, " -> ");
		bch2_bpos_to_text(&buf, new_min);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->min_key = new_min;
	new->v.min_key = new_min;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);
	bkey_copy(&b->key, &new->k_i);
	return 0;
}

static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, " -> ");
		bch2_bpos_to_text(&buf, new_max);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
	if (ret)
		return ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->max_key = new_max;
	new->k.p = new_max;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);

	mutex_lock(&c->btree_cache.lock);
	__bch2_btree_node_hash_remove(&c->btree_cache, b);

	bkey_copy(&b->key, &new->k_i);
	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
	BUG_ON(ret);
	mutex_unlock(&c->btree_cache.lock);
	return 0;
}

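/*
 * Check that a child node's range starts exactly where expected: at the
 * parent's min_key for the first child, at the successor of the previous
 * child's max_key otherwise. Gaps are filled from nodes found by
 * btree_node_scan (at level 1) or repaired by adjusting min_key; overlaps
 * are resolved by dropping or trimming whichever node is older, signalled
 * to the caller via DROP_THIS_NODE/DROP_PREV_NODE/DID_FILL_FROM_SCAN.
 */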
static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
				       struct btree *prev, struct btree *cur,
				       struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct bpos expected_start = !prev
		? b->data->min_key
		: bpos_successor(prev->key.k.p);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
			b->data->min_key));

	if (bpos_eq(expected_start, cur->data->min_key))
		return 0;

	prt_printf(&buf, " at ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_printf(&buf, ":\n parent: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (prev) {
		prt_printf(&buf, "\n prev: ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
	}

	prt_str(&buf, "\n next: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));

	if (bpos_lt(expected_start, cur->data->min_key)) {	/* gap */
		if (b->c.level == 1 &&
		    bpos_lt(*pulled_from_scan, cur->data->min_key)) {
			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
						     expected_start,
						     bpos_predecessor(cur->data->min_key));
			if (ret)
				goto err;

			*pulled_from_scan = cur->data->min_key;
			ret = DID_FILL_FROM_SCAN;
		} else {
			if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
					     "btree node with incorrect min_key%s", buf.buf))
				ret = set_node_min(c, cur, expected_start);
		}
	} else {						/* overlap */
		if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
			if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
						     "btree node overwritten by next node%s", buf.buf))
					ret = DROP_PREV_NODE;
			} else {
				if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
						     "btree node with incorrect max_key%s", buf.buf))
					ret = set_node_max(c, prev,
							   bpos_predecessor(cur->data->min_key));
			}
		} else {
			if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
				if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
						     "btree node overwritten by prev node%s", buf.buf))
					ret = DROP_THIS_NODE;
			} else {
				if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
						     "btree node with incorrect min_key%s", buf.buf))
					ret = set_node_min(c, cur, expected_start);
			}
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
				 struct btree *child, struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (bpos_eq(child->key.k.p, b->key.k.p))
		return 0;

	prt_printf(&buf, " at ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_printf(&buf, ":\n parent: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\n child: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));

	if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
			     "btree node with incorrect max_key%s", buf.buf)) {
		if (b->c.level == 1 &&
		    bpos_lt(*pulled_from_scan, b->key.k.p)) {
			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
						     bpos_successor(child->key.k.p), b->key.k.p);
			if (ret)
				goto err;

			*pulled_from_scan = b->key.k.p;
			ret = DID_FILL_FROM_SCAN;
		} else {
			ret = set_node_max(c, child, b->key.k.p);
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

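/*
 * Walk an interior node's children, checking that their ranges tile the
 * parent's range exactly: unreadable or stale children are evicted, gaps
 * and overlaps are repaired via btree_check_node_boundaries() and
 * btree_repair_node_end(), then we recurse into each surviving child.
 * Returns DROP_THIS_NODE if the node is left with no children.
 */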
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
					      struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf prev_k, cur_k;
	struct btree *prev = NULL, *cur = NULL;
	bool have_child, new_pass = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!b->c.level)
		return 0;

	bch2_bkey_buf_init(&prev_k);
	bch2_bkey_buf_init(&cur_k);
again:
	cur = prev = NULL;
	have_child = new_pass = false;
	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
	iter.prefetch = true;

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		bch2_btree_and_journal_iter_advance(&iter);
		bch2_bkey_buf_reassemble(&cur_k, c, k);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);

		printbuf_reset(&buf);
		bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
		prt_char(&buf, ' ');
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));

		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
					trans, btree_node_read_error,
					"Topology repair: unreadable btree node at\n"
					" %s",
					buf.buf)) {
			bch2_btree_node_evict(trans, cur_k.k);
			cur = NULL;
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			if (ret)
				break;

			ret = bch2_btree_lost_data(c, b->c.btree_id);
			if (ret)
				break;
			continue;
		}

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			break;

		if (bch2_btree_node_is_stale(c, cur)) {
			bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
		if (ret == DID_FILL_FROM_SCAN) {
			new_pass = true;
			ret = 0;
		}

		if (ret == DROP_THIS_NODE) {
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		if (prev)
			six_unlock_read(&prev->c.lock);
		prev = NULL;

		if (ret == DROP_PREV_NODE) {
			bch_info(c, "dropped prev node");
			bch2_btree_node_evict(trans, prev_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, prev_k.k->k.p);
			if (ret)
				break;

			bch2_btree_and_journal_iter_exit(&iter);
			goto again;
		} else if (ret)
			break;

		prev = cur;
		cur = NULL;
		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
	}

	if (!ret && !IS_ERR_OR_NULL(prev)) {
		BUG_ON(cur);
		ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
		if (ret == DID_FILL_FROM_SCAN) {
			new_pass = true;
			ret = 0;
		}
	}

	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	prev = NULL;
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);
	cur = NULL;

	if (ret)
		goto err;

	bch2_btree_and_journal_iter_exit(&iter);

	if (new_pass)
		goto again;

	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
	iter.prefetch = true;

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		bch2_bkey_buf_reassemble(&cur_k, c, k);
		bch2_btree_and_journal_iter_advance(&iter);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			goto err;

		ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
		six_unlock_read(&cur->c.lock);
		cur = NULL;

		if (ret == DROP_THIS_NODE) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			new_pass = true;
		}

		if (ret)
			goto err;

		have_child = true;
	}

	printbuf_reset(&buf);
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (mustfix_fsck_err_on(!have_child,
				trans, btree_node_topology_interior_node_empty,
				"empty interior btree node at %s", buf.buf))
		ret = DROP_THIS_NODE;
err:
fsck_err:
	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);

	bch2_btree_and_journal_iter_exit(&iter);

	if (!ret && new_pass)
		goto again;

	BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));

	bch2_bkey_buf_exit(&prev_k, c);
	bch2_bkey_buf_exit(&cur_k, c);
	printbuf_exit(&buf);
	return ret;
}

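/*
 * Check and repair topology for every btree: roots that couldn't be read
 * are either reconstructed from nodes found by btree_node_scan or replaced
 * with a fake empty root, then each tree is repaired recursively.
 */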
int bch2_check_topology(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bpos pulled_from_scan = POS_MIN;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	bch2_trans_srcu_unlock(trans);

	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);
		bool reconstructed_root = false;

		printbuf_reset(&buf);
		bch2_btree_id_to_text(&buf, i);

		if (r->error) {
			ret = bch2_btree_lost_data(c, i);
			if (ret)
				break;
reconstruct_root:
			bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);

			r->alive = false;
			r->error = 0;

			if (!bch2_btree_has_scanned_nodes(c, i)) {
				mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
						 "no nodes found for btree %s, continue?", buf.buf);
				bch2_btree_root_alloc_fake_trans(trans, i, 0);
			} else {
				bch2_btree_root_alloc_fake_trans(trans, i, 1);
				bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
				ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
				if (ret)
					break;
			}

			reconstructed_root = true;
		}

		struct btree *b = r->b;

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
		six_unlock_read(&b->c.lock);

		if (ret == DROP_THIS_NODE) {
			mutex_lock(&c->btree_cache.lock);
			bch2_btree_node_hash_remove(&c->btree_cache, b);
			mutex_unlock(&c->btree_cache.lock);

			r->b = NULL;

			if (!reconstructed_root)
				goto reconstruct_root;

			bch_err(c, "empty btree root %s", buf.buf);
			bch2_btree_root_alloc_fake_trans(trans, i, 0);
			r->alive = false;
			ret = 0;
		}
	}
fsck_err:
	printbuf_exit(&buf);
	bch2_trans_put(trans);
	return ret;
}

/* marking of btree keys/nodes: */

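/*
 * Mark a single key for GC: run the check_repair trigger first, committing
 * any repairs it queued, then run the (non-idempotent) gc trigger that
 * accounts this key's references into the in-memory GC state.
 */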
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
			    unsigned level, struct btree **prev,
			    struct btree_iter *iter, struct bkey_s_c k,
			    bool initial)
{
	struct bch_fs *c = trans->c;

	if (iter) {
		struct btree_path *path = btree_iter_path(trans, iter);
		struct btree *b = path_l(path)->b;

		if (*prev != b) {
			int ret = bch2_btree_node_check_topology(trans, b);
			if (ret)
				return ret;
		}
		*prev = b;
	}

	struct bkey deleted = KEY(0, 0, 0);
	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	deleted.p = k.k->p;

	if (initial) {
		BUG_ON(bch2_journal_seq_verify &&
		       k.k->bversion.lo > atomic64_read(&c->journal.seq));

		if (fsck_err_on(btree_id != BTREE_ID_accounting &&
				k.k->bversion.lo > atomic64_read(&c->key_version),
				trans, bkey_version_in_future,
				"key version number higher than recorded %llu\n %s",
				atomic64_read(&c->key_version),
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			atomic64_set(&c->key_version, k.k->bversion.lo);
	}

	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
				trans, btree_bitmap_not_marked,
				"btree ptr not marked in member info btree allocated bitmap\n %s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
		mutex_lock(&c->sb_lock);
		bch2_dev_btree_bitmap_mark(c, k);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	/*
	 * We require a commit before key_trigger() because
	 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
	 * wrong result if we run it multiple times.
	 */
	unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;

	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
			       BTREE_TRIGGER_check_repair|flags);
	if (ret)
		goto out;

	if (trans->nr_updates) {
		ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
			-BCH_ERR_transaction_restart_nested;
		goto out;
	}

	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
			       BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
out:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
{
	struct bch_fs *c = trans->c;
	unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
	int ret = 0;

	/* We need to make sure every leaf node is readable before going RW */
	if (initial)
		target_depth = 0;

	for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
		struct btree *prev = NULL;
		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
					  BTREE_ITER_prefetch);

		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
			gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
			bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
		}));
		if (ret)
			goto err;
	}

	/* root */
	do {
retry_root:
		bch2_trans_begin(trans);

		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
					  0, bch2_btree_id_root(c, btree)->b->c.level, 0);
		struct btree *b = bch2_btree_iter_peek_node(&iter);
		ret = PTR_ERR_OR_ZERO(b);
		if (ret)
			goto err_root;

		if (b != btree_node_root(c, b)) {
			bch2_trans_iter_exit(trans, &iter);
			goto retry_root;
		}

		gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
		ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
err_root:
		bch2_trans_iter_exit(trans, &iter);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
err:
	bch_err_fn(c, ret);
	return ret;
}

static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return cmp_int(gc_btree_order(l), gc_btree_order(r));
}

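/*
 * Walk every btree and mark all keys, in GC phase order (see
 * btree_id_gc_phase_cmp()); btree ids not known to this version
 * (>= BTREE_ID_NR) are walked afterwards, in id order.
 */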
static int bch2_gc_btrees(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	enum btree_id ids[BTREE_ID_NR];
	struct printbuf buf = PRINTBUF;
	unsigned i;
	int ret = 0;

	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		unsigned btree = i < BTREE_ID_NR ? ids[i] : i;

		if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
			continue;

		ret = bch2_gc_btree(trans, btree, true);
	}

	printbuf_exit(&buf);
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

static int bch2_mark_superblocks(struct bch_fs *c)
{
	gc_pos_set(c, gc_phase(GC_PHASE_sb));

	return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
}

static void bch2_gc_free(struct bch_fs *c)
{
	bch2_accounting_gc_free(c);

	genradix_free(&c->reflink_gc_table);
	genradix_free(&c->gc_stripes);

	for_each_member_device(c, ca)
		genradix_free(&ca->buckets_gc);
}

static int bch2_gc_start(struct bch_fs *c)
{
	for_each_member_device(c, ca) {
		int ret = bch2_dev_usage_init(ca, true);
		if (ret) {
			bch2_dev_put(ca);
			return ret;
		}
	}

	return 0;
}

/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
				     struct bch_alloc_v4 r)
{
	return l.gen != r.gen ||
		l.oldest_gen != r.oldest_gen ||
		l.data_type != r.data_type ||
		l.dirty_sectors != r.dirty_sectors ||
		l.stripe_sectors != r.stripe_sectors ||
		l.cached_sectors != r.cached_sectors ||
		l.stripe_redundancy != r.stripe_redundancy ||
		l.stripe != r.stripe;
}

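/*
 * Compare the alloc key against the in-memory bucket state computed by GC
 * and, if they disagree, write back a corrected alloc key (without
 * rerunning triggers); device usage counters are fixed up here as well.
 */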
static int bch2_alloc_write_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bch_dev *ca,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_alloc_v4 *a;
	struct bch_alloc_v4 old_gc, gc, old_convert, new;
	const struct bch_alloc_v4 *old;
	int ret;

	if (!bucket_valid(ca, k.k->p.offset))
		return 0;

	old = bch2_alloc_to_v4(k, &old_convert);
	gc = new = *old;

	__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));

	old_gc = gc;

	if ((old->data_type == BCH_DATA_sb ||
	     old->data_type == BCH_DATA_journal) &&
	    !bch2_dev_is_online(ca)) {
		gc.data_type = old->data_type;
		gc.dirty_sectors = old->dirty_sectors;
	}

	/*
	 * gc.data_type doesn't yet include need_discard & need_gc_gen states -
	 * fix that here:
	 */
	alloc_data_type_set(&gc, gc.data_type);
	if (gc.data_type != old_gc.data_type ||
	    gc.dirty_sectors != old_gc.dirty_sectors) {
		ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
		if (ret)
			return ret;

		/*
		 * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
		 * safe w.r.t. transaction restarts, so fixup the gc_bucket so
		 * we don't run it twice:
		 */
		struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
		gc_m->data_type = gc.data_type;
		gc_m->dirty_sectors = gc.dirty_sectors;
	}

	if (fsck_err_on(new.data_type != gc.data_type,
			trans, alloc_key_data_type_wrong,
			"bucket %llu:%llu gen %u has wrong data_type"
			": got %s, should be %s",
			iter->pos.inode, iter->pos.offset,
			gc.gen,
			bch2_data_type_str(new.data_type),
			bch2_data_type_str(gc.data_type)))
		new.data_type = gc.data_type;

#define copy_bucket_field(_errtype, _f)					\
	if (fsck_err_on(new._f != gc._f,				\
			trans, _errtype,				\
			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
			": got %llu, should be %llu",			\
			iter->pos.inode, iter->pos.offset,		\
			gc.gen,						\
			bch2_data_type_str(gc.data_type),		\
			(u64) new._f, (u64) gc._f))			\
		new._f = gc._f;						\

	copy_bucket_field(alloc_key_gen_wrong, gen);
	copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
	copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors);
	copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
	copy_bucket_field(alloc_key_stripe_wrong, stripe);
	copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
#undef copy_bucket_field

	if (!bch2_alloc_v4_cmp(*old, new))
		return 0;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	a->v = new;

	/*
	 * The trigger normally makes sure these are set, but we're not running
	 * triggers:
	 */
	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
fsck_err:
	return ret;
}

static int bch2_gc_alloc_done(struct bch_fs *c)
{
	int ret = 0;

	for_each_member_device(c, ca) {
		ret = bch2_trans_run(c,
			for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
					POS(ca->dev_idx, ca->mi.first_bucket),
					POS(ca->dev_idx, ca->mi.nbuckets - 1),
					BTREE_ITER_slots|BTREE_ITER_prefetch, k,
					NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
				bch2_alloc_write_key(trans, &iter, ca, k)));
		if (ret) {
			bch2_dev_put(ca);
			break;
		}
	}

	bch_err_fn(c, ret);
	return ret;
}

static int bch2_gc_alloc_start(struct bch_fs *c)
{
	int ret = 0;

	for_each_member_device(c, ca) {
		ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
		if (ret) {
			bch2_dev_put(ca);
			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
			break;
		}
	}

	bch_err_fn(c, ret);
	return ret;
}

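/*
 * Compare a stripe's per-block sector counts against what GC counted and
 * rewrite the stripe key if they disagree.
 */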
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	const struct bch_stripe *s;
	struct gc_stripe *m;
	bool bad = false;
	unsigned i;
	int ret = 0;

	if (k.k->type != KEY_TYPE_stripe)
		return 0;

	s = bkey_s_c_to_stripe(k).v;
	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);

	for (i = 0; i < s->nr_blocks; i++) {
		u32 old = stripe_blockcount_get(s, i);
		u32 new = (m ? m->block_sectors[i] : 0);

		if (old != new) {
			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
				   i, old, new);
			bad = true;
		}
	}

	if (bad)
		bch2_bkey_val_to_text(&buf, c, k);

	if (fsck_err_on(bad,
			trans, stripe_sector_count_wrong,
			"%s", buf.buf)) {
		struct bkey_i_stripe *new;

		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		bkey_reassemble(&new->k_i, k);

		for (i = 0; i < new->v.nr_blocks; i++)
			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);

		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_stripes_done(struct bch_fs *c)
{
	return bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_stripes, POS_MIN,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_gc_write_stripes_key(trans, &iter, k)));
}

/**
 * bch2_check_allocations - walk all references to buckets, and recompute them:
 *
 * @c: filesystem object
 *
 * Returns: 0 on success, or standard errcode on failure
 *
 * Order matters here:
 *  - Concurrent GC relies on the fact that we have a total ordering for
 *    everything that GC walks - see gc_will_visit_node(),
 *    gc_will_visit_root()
 *
 *  - also, references move around in the course of index updates and
 *    various other crap: everything needs to agree on the ordering
 *    references are allowed to move around in - e.g., we're allowed to
 *    start with a reference owned by an open_bucket (the allocator) and
 *    move it to the btree, but not the reverse.
 *
 *    This is necessary to ensure that gc doesn't miss references that
 *    move around - if references move backwards in the ordering GC
 *    uses, GC could skip past them
 */
int bch2_check_allocations(struct bch_fs *c)
{
	int ret;

	lockdep_assert_held(&c->state_lock);

	down_write(&c->gc_lock);

	bch2_btree_interior_updates_flush(c);

	ret = bch2_gc_accounting_start(c) ?:
		bch2_gc_start(c) ?:
		bch2_gc_alloc_start(c) ?:
		bch2_gc_reflink_start(c);
	if (ret)
		goto out;

	gc_pos_set(c, gc_phase(GC_PHASE_start));

	ret = bch2_mark_superblocks(c);
	bch_err_msg(c, ret, "marking superblocks");
	if (ret)
		goto out;

	ret = bch2_gc_btrees(c);
	if (ret)
		goto out;

	c->gc_count++;

	ret = bch2_gc_alloc_done(c) ?:
		bch2_gc_accounting_done(c) ?:
		bch2_gc_stripes_done(c) ?:
		bch2_gc_reflink_done(c);
out:
	percpu_down_write(&c->mark_lock);
	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_not_running));

	bch2_gc_free(c);
	percpu_up_write(&c->mark_lock);

	up_write(&c->gc_lock);

	/*
	 * At startup, allocations can happen directly instead of via the
	 * allocator thread - issue wakeup in case they blocked on gc_lock:
	 */
	closure_wake_up(&c->freelist_wait);
	bch_err_fn(c, ret);
	return ret;
}

static int gc_btree_gens_key(struct btree_trans *trans,
			     struct btree_iter *iter,
			     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bkey_i *u;
	int ret;

	if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
		return -EROFS;

	rcu_read_lock();
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ca)
			continue;

		if (dev_ptr_stale(ca, ptr) > 16) {
			rcu_read_unlock();
			goto update;
		}
	}

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ca)
			continue;

		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
		if (gen_after(*gen, ptr->gen))
			*gen = ptr->gen;
	}
	rcu_read_unlock();
	return 0;
update:
	u = bch2_bkey_make_mut(trans, iter, &k, 0);
	ret = PTR_ERR_OR_ZERO(u);
	if (ret)
		return ret;

	bch2_extent_normalize(c, bkey_i_to_s(u));
	return 0;
}

static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
				       struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
	struct bkey_i_alloc_v4 *a_mut;
	int ret;

	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
		return 0;

	a_mut = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a_mut);
	if (ret)
		return ret;

	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];

	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}

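/*
 * Bucket generation garbage collection: walk every btree with pointers to
 * find, for each bucket, the oldest generation still referenced (rewriting
 * keys whose pointers are badly stale along the way), then store the
 * result in each bucket's alloc key as oldest_gen.
 */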
int bch2_gc_gens(struct bch_fs *c)
{
	u64 b, start_time = local_clock();
	int ret;

	if (!mutex_trylock(&c->gc_gens_lock))
		return 0;

	trace_and_count(c, gc_gens_start, c);

	/*
	 * We have to use trylock here. Otherwise, we would
	 * introduce a deadlock in the RO path - we take the
	 * state lock at the start of going RO.
	 */
	if (!down_read_trylock(&c->state_lock)) {
		mutex_unlock(&c->gc_gens_lock);
		return 0;
	}

	for_each_member_device(c, ca) {
		struct bucket_gens *gens = bucket_gens(ca);

		BUG_ON(ca->oldest_gen);

		ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
		if (!ca->oldest_gen) {
			bch2_dev_put(ca);
			ret = -BCH_ERR_ENOMEM_gc_gens;
			goto err;
		}

		for (b = gens->first_bucket;
		     b < gens->nbuckets; b++)
			ca->oldest_gen[b] = gens->b[b];
	}

	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		if (btree_type_has_ptrs(i)) {
			c->gc_gens_btree = i;
			c->gc_gens_pos = POS_MIN;

			ret = bch2_trans_run(c,
				for_each_btree_key_commit(trans, iter, i,
						POS_MIN,
						BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
						k,
						NULL, NULL,
						BCH_TRANS_COMMIT_no_enospc,
					gc_btree_gens_key(trans, &iter, k)));
			if (ret)
				goto err;
		}

	struct bch_dev *ca = NULL;
	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS_MIN,
				BTREE_ITER_prefetch,
				k,
				NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}
			bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
		})));
	bch2_dev_put(ca);

	if (ret)
		goto err;

	c->gc_gens_btree = 0;
	c->gc_gens_pos = POS_MIN;

	c->gc_count++;

	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
	trace_and_count(c, gc_gens_end, c);
err:
	for_each_member_device(c, ca) {
		kvfree(ca->oldest_gen);
		ca->oldest_gen = NULL;
	}

	up_read(&c->state_lock);
	mutex_unlock(&c->gc_gens_lock);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

static void bch2_gc_gens_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
	bch2_gc_gens(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}

void bch2_gc_gens_async(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
	    !queue_work(c->write_ref_wq, &c->gc_gens_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}

void bch2_fs_btree_gc_exit(struct bch_fs *c)
{
}

int bch2_fs_btree_gc_init(struct bch_fs *c)
{
	seqcount_init(&c->gc_pos_lock);
	INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);

	init_rwsem(&c->gc_lock);
	mutex_init(&c->gc_gens_lock);
	return 0;
}