// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>

#define DROP_THIS_NODE		10
#define DROP_PREV_NODE		11

static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
	return (struct bkey_s) {{{
		(struct bkey *) k.k,
		(struct bch_val *) k.v
	}}};
}

static bool should_restart_for_topology_repair(struct bch_fs *c)
{
	return c->opts.fix_errors != FSCK_FIX_no &&
		!(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
}

static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	preempt_disable();
	write_seqcount_begin(&c->gc_pos_lock);
	c->gc_pos = new_pos;
	write_seqcount_end(&c->gc_pos_lock);
	preempt_enable();
}

static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
	__gc_pos_set(c, new_pos);
}

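/*
 * Check that the child keys of an interior node line up: for btree_ptr_v2
 * keys, the child's min_key must immediately follow the previous child's
 * max_key, and the last child must end at this node's max_key.  On a mismatch
 * we either restart with the topology repair pass or flag the filesystem as
 * having unfixed errors from initial gc.
 */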
/*
 * Missing: if an interior btree node is empty, we need to do something -
 * perhaps just kill it
 */
static int bch2_gc_check_topology(struct bch_fs *c,
				  struct btree *b,
				  struct bkey_buf *prev,
				  struct bkey_buf cur,
				  bool is_last)
{
	struct bpos node_start = b->data->min_key;
	struct bpos node_end = b->data->max_key;
	struct bpos expected_start = bkey_deleted(&prev->k->k)
		? node_start
		: bpos_successor(prev->k->k.p);
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
	int ret = 0;

	if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);

		if (!bpos_eq(expected_start, bp->v.min_key)) {
			bch2_topology_error(c);

			if (bkey_deleted(&prev->k->k)) {
				prt_printf(&buf1, "start of node: ");
				bch2_bpos_to_text(&buf1, node_start);
			} else {
				bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
			}
			bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));

			if (__fsck_err(c,
				       FSCK_CAN_FIX|
				       FSCK_CAN_IGNORE|
				       FSCK_NO_RATELIMIT,
				       btree_node_topology_bad_min_key,
				       "btree node with incorrect min_key at btree %s level %u:\n"
				       " prev %s\n"
				       " cur %s",
				       bch2_btree_id_str(b->c.btree_id), b->c.level,
				       buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
				bch_info(c, "Halting mark and sweep to start topology repair pass");
				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
				goto err;
			} else {
				set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
			}
		}
	}

	if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
		bch2_topology_error(c);

		printbuf_reset(&buf1);
		printbuf_reset(&buf2);

		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
		bch2_bpos_to_text(&buf2, node_end);

		if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
			       btree_node_topology_bad_max_key,
			       "btree node with incorrect max_key at btree %s level %u:\n"
			       " %s\n"
			       " expected %s",
			       bch2_btree_id_str(b->c.btree_id), b->c.level,
			       buf1.buf, buf2.buf) &&
		    should_restart_for_topology_repair(c)) {
			bch_info(c, "Halting mark and sweep to start topology repair pass");
			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
			goto err;
		} else {
			set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
		}
	}

	bch2_bkey_buf_copy(prev, c, cur.k);
err:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
	switch (b->key.k.type) {
	case KEY_TYPE_btree_ptr: {
		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);

		dst->k.p = src->k.p;
		dst->v.mem_ptr = 0;
		dst->v.seq = b->data->keys.seq;
		dst->v.sectors_written = 0;
		dst->v.flags = 0;
		dst->v.min_key = b->data->min_key;
		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
		break;
	}
	case KEY_TYPE_btree_ptr_v2:
		bkey_copy(&dst->k_i, &b->key);
		break;
	default:
		BUG();
	}
}

static void bch2_btree_node_update_key_early(struct btree_trans *trans,
					     enum btree_id btree, unsigned level,
					     struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->min_key = new_min;
	new->v.min_key = new_min;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);
	bkey_copy(&b->key, &new->k_i);
	return 0;
}

static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
	if (ret)
		return ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->max_key = new_max;
	new->k.p = new_max;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);

	mutex_lock(&c->btree_cache.lock);
	bch2_btree_node_hash_remove(&c->btree_cache, b);

	bkey_copy(&b->key, &new->k_i);
	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
	BUG_ON(ret);
	mutex_unlock(&c->btree_cache.lock);
	return 0;
}

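/*
 * Given two adjacent child nodes, resolve an overlap or gap between their key
 * ranges: drop whichever node was overwritten (judging by the node sequence
 * numbers), or rewrite prev's max_key/cur's min_key so the ranges line up.
 * Returns DROP_THIS_NODE/DROP_PREV_NODE to tell the caller which node to
 * evict.
 */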
static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
					struct btree *prev, struct btree *cur)
{
	struct bpos expected_start = !prev
		? b->data->min_key
		: bpos_successor(prev->key.k.p);
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
	int ret = 0;

	if (!prev) {
		prt_printf(&buf1, "start of node: ");
		bch2_bpos_to_text(&buf1, b->data->min_key);
	} else {
		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
	}

	bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));

	if (prev &&
	    bpos_gt(expected_start, cur->data->min_key) &&
	    BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
		/* cur overwrites prev: */

		if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
						cur->data->min_key), c,
				btree_node_topology_overwritten_by_next_node,
				"btree node overwritten by next node at btree %s level %u:\n"
				" node %s\n"
				" next %s",
				bch2_btree_id_str(b->c.btree_id), b->c.level,
				buf1.buf, buf2.buf)) {
			ret = DROP_PREV_NODE;
			goto out;
		}

		if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
						 bpos_predecessor(cur->data->min_key)), c,
				btree_node_topology_bad_max_key,
				"btree node with incorrect max_key at btree %s level %u:\n"
				" node %s\n"
				" next %s",
				bch2_btree_id_str(b->c.btree_id), b->c.level,
				buf1.buf, buf2.buf))
			ret = set_node_max(c, prev,
					   bpos_predecessor(cur->data->min_key));
	} else {
		/* prev overwrites cur: */

		if (mustfix_fsck_err_on(bpos_ge(expected_start,
						cur->data->max_key), c,
				btree_node_topology_overwritten_by_prev_node,
				"btree node overwritten by prev node at btree %s level %u:\n"
				" prev %s\n"
				" node %s",
				bch2_btree_id_str(b->c.btree_id), b->c.level,
				buf1.buf, buf2.buf)) {
			ret = DROP_THIS_NODE;
			goto out;
		}

		if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
				btree_node_topology_bad_min_key,
				"btree node with incorrect min_key at btree %s level %u:\n"
				" prev %s\n"
				" node %s",
				bch2_btree_id_str(b->c.btree_id), b->c.level,
				buf1.buf, buf2.buf))
			ret = set_node_min(c, cur, expected_start);
	}
out:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
				 struct btree *child)
{
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
	int ret = 0;

	bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
	bch2_bpos_to_text(&buf2, b->key.k.p);

	if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
				btree_node_topology_bad_max_key,
			"btree node with incorrect max_key at btree %s level %u:\n"
			" %s\n"
			" expected %s",
			bch2_btree_id_str(b->c.btree_id), b->c.level,
			buf1.buf, buf2.buf)) {
		ret = set_node_max(c, child, b->key.k.p);
		if (ret)
			goto err;
	}
err:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

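/*
 * Walk one interior node: evict unreadable children, fix up child boundaries
 * via btree_repair_node_boundaries()/btree_repair_node_end(), then recurse
 * into each surviving child.  Returns DROP_THIS_NODE if every child was
 * dropped, so the caller can remove this node as well.
 */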
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf prev_k, cur_k;
	struct btree *prev = NULL, *cur = NULL;
	bool have_child, dropped_children = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!b->c.level)
		return 0;
again:
	prev = NULL;
	have_child = dropped_children = false;
	bch2_bkey_buf_init(&prev_k);
	bch2_bkey_buf_init(&cur_k);
	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		bch2_btree_and_journal_iter_advance(&iter);
		bch2_bkey_buf_reassemble(&cur_k, c, k);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
					b->c.btree_id, b->c.level - 1,
					false);
		ret = PTR_ERR_OR_ZERO(cur);

		printbuf_reset(&buf);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));

		if (mustfix_fsck_err_on(ret == -EIO, c,
				btree_node_unreadable,
				"Topology repair: unreadable btree node at btree %s level %u:\n"
				" %s",
				bch2_btree_id_str(b->c.btree_id),
				b->c.level - 1,
				buf.buf)) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			break;

		ret = btree_repair_node_boundaries(c, b, prev, cur);

		if (ret == DROP_THIS_NODE) {
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		if (prev)
			six_unlock_read(&prev->c.lock);
		prev = NULL;

		if (ret == DROP_PREV_NODE) {
			bch2_btree_node_evict(trans, prev_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, prev_k.k->k.p);
			if (ret)
				break;

			bch2_btree_and_journal_iter_exit(&iter);
			bch2_bkey_buf_exit(&prev_k, c);
			bch2_bkey_buf_exit(&cur_k, c);
			goto again;
		} else if (ret)
			break;

		prev = cur;
		cur = NULL;
		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
	}

	if (!ret && !IS_ERR_OR_NULL(prev)) {
		BUG_ON(cur);
		ret = btree_repair_node_end(c, b, prev);
	}

	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	prev = NULL;
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);
	cur = NULL;

	if (ret)
		goto err;

	bch2_btree_and_journal_iter_exit(&iter);
	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		bch2_bkey_buf_reassemble(&cur_k, c, k);
		bch2_btree_and_journal_iter_advance(&iter);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
					b->c.btree_id, b->c.level - 1,
					false);
		ret = PTR_ERR_OR_ZERO(cur);

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			goto err;

		ret = bch2_btree_repair_topology_recurse(trans, cur);
		six_unlock_read(&cur->c.lock);
		cur = NULL;

		if (ret == DROP_THIS_NODE) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			dropped_children = true;
		}

		if (ret)
			goto err;

		have_child = true;
	}

	printbuf_reset(&buf);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (mustfix_fsck_err_on(!have_child, c,
			btree_node_topology_interior_node_empty,
			"empty interior btree node at btree %s level %u\n"
			" %s",
			bch2_btree_id_str(b->c.btree_id),
			b->c.level, buf.buf))
		ret = DROP_THIS_NODE;
err:
fsck_err:
	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);

	bch2_btree_and_journal_iter_exit(&iter);
	bch2_bkey_buf_exit(&prev_k, c);
	bch2_bkey_buf_exit(&cur_k, c);

	if (!ret && dropped_children)
		goto again;

	printbuf_exit(&buf);
	return ret;
}

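/*
 * Entry point for the check_topology recovery pass: repair the topology of
 * every live btree, starting from each btree root.
 */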
int bch2_check_topology(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree *b;
	unsigned i;
	int ret = 0;

	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->alive)
			continue;

		b = r->b;
		if (btree_node_fake(b))
			continue;

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		ret = bch2_btree_repair_topology_recurse(trans, b);
		six_unlock_read(&b->c.lock);

		if (ret == DROP_THIS_NODE) {
			bch_err(c, "empty btree root - repair unimplemented");
			ret = -BCH_ERR_fsck_repair_unimplemented;
		}
	}

	bch2_trans_put(trans);

	return ret;
}

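/*
 * Compare a key's pointers against the gc-computed bucket state: fix up
 * bucket gens/data types where the bucket is clearly wrong, and rewrite the
 * key (dropping bad pointers and stale stripe pointers) where the pointer
 * itself is wrong.  The rewritten key is inserted into the journal keys so
 * later passes see the fixed version.
 */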
static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
			       unsigned level, bool is_root,
			       struct bkey_s_c *k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
	const union bch_extent_entry *entry_c;
	struct extent_ptr_decoded p = { 0 };
	bool do_update = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	/*
	 * XXX
	 * use check_bucket_ref here
	 */
	bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);

		if (!g->gen_valid &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, ptr_to_missing_alloc_key,
			      "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
			      p.ptr.gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
			if (!p.ptr.cached) {
				g->gen_valid = true;
				g->gen = p.ptr.gen;
			} else {
				do_update = true;
			}
		}

		if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, ptr_gen_newer_than_bucket_gen,
			      "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
			      p.ptr.gen, g->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
			if (!p.ptr.cached) {
				g->gen_valid = true;
				g->gen = p.ptr.gen;
				g->data_type = 0;
				g->dirty_sectors = 0;
				g->cached_sectors = 0;
				set_bit(BCH_FS_need_another_gc, &c->flags);
			} else {
				do_update = true;
			}
		}

		if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, ptr_gen_newer_than_bucket_gen,
			      "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
			      p.ptr.gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
			do_update = true;

		if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, stale_dirty_ptr,
			      "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
			      p.ptr.gen, g->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
			do_update = true;

		if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
			continue;

		if (fsck_err_on(bucket_data_type(g->data_type) &&
				bucket_data_type(g->data_type) != data_type, c,
				ptr_bucket_data_type_mismatch,
				"bucket %u:%zu different types of data in same bucket: %s, %s\n"
				"while marking %s",
				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
				bch2_data_type_str(g->data_type),
				bch2_data_type_str(data_type),
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
			if (data_type == BCH_DATA_btree) {
				g->data_type = data_type;
				set_bit(BCH_FS_need_another_gc, &c->flags);
			} else {
				do_update = true;
			}
		}

		if (p.has_ec) {
			struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);

			if (fsck_err_on(!m || !m->alive, c,
					ptr_to_missing_stripe,
					"pointer to nonexistent stripe %llu\n"
					"while marking %s",
					(u64) p.ec.idx,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
				do_update = true;

			if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
					ptr_to_incorrect_stripe,
					"pointer does not match stripe %llu\n"
					"while marking %s",
					(u64) p.ec.idx,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
				do_update = true;
		}
	}

	if (do_update) {
		struct bkey_ptrs ptrs;
		union bch_extent_entry *entry;
		struct bch_extent_ptr *ptr;
		struct bkey_i *new;

		if (is_root) {
			bch_err(c, "cannot update btree roots yet");
			ret = -EINVAL;
			goto err;
		}

		new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
		if (!new) {
			ret = -BCH_ERR_ENOMEM_gc_repair_key;
			bch_err_msg(c, ret, "allocating new key");
			goto err;
		}

		bkey_reassemble(new, *k);

		if (level) {
			/*
			 * We don't want to drop btree node pointers - if the
			 * btree node isn't there anymore, the read path will
			 * sort it out:
			 */
			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
			bkey_for_each_ptr(ptrs, ptr) {
				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
				struct bucket *g = PTR_GC_BUCKET(ca, ptr);

				ptr->gen = g->gen;
			}
		} else {
			bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
				enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);

				(ptr->cached &&
				 (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
				(!ptr->cached &&
				 gen_cmp(ptr->gen, g->gen) < 0) ||
				gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
				(g->data_type &&
				 g->data_type != data_type);
			}));
again:
			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
			bkey_extent_entry_for_each(ptrs, entry) {
				if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
					struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
									   entry->stripe_ptr.idx);
					union bch_extent_entry *next_ptr;

					bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
						if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
							goto found;
					next_ptr = NULL;
found:
					if (!next_ptr) {
						bch_err(c, "aieee, found stripe ptr with no data ptr");
						continue;
					}

					if (!m || !m->alive ||
					    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
								       &next_ptr->ptr,
								       m->sectors)) {
						bch2_bkey_extent_entry_drop(new, entry);
						goto again;
					}
				}
			}
		}

		ret = bch2_journal_key_insert_take(c, btree_id, level, new);
		if (ret) {
			kfree(new);
			goto err;
		}

		if (level)
			bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);

		if (0) {
			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, *k);
			bch_info(c, "updated %s", buf.buf);

			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
			bch_info(c, "new key %s", buf.buf);
		}

		*k = bkey_i_to_s_c(new);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* marking of btree keys/nodes: */

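/*
 * Mark a single key for gc: optionally run the pointer fixup pass first (when
 * called from initial gc), check for key version numbers from the future, and
 * then run the usual triggers with BTREE_TRIGGER_GC so the gc copies of the
 * bucket/usage counters get updated.
 */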
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
			    unsigned level, bool is_root,
			    struct bkey_s_c *k,
			    bool initial)
{
	struct bch_fs *c = trans->c;
	struct bkey deleted = KEY(0, 0, 0);
	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
	int ret = 0;

	deleted.p = k->k->p;

	if (initial) {
		BUG_ON(bch2_journal_seq_verify &&
		       k->k->version.lo > atomic64_read(&c->journal.seq));

		ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
		if (ret)
			goto err;

		if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
				bkey_version_in_future,
				"key version number higher than recorded: %llu > %llu",
				k->k->version.lo,
				atomic64_read(&c->key_version)))
			atomic64_set(&c->key_version, k->k->version.lo);
	}

	ret = commit_do(trans, NULL, NULL, 0,
			bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
fsck_err:
err:
	bch_err_fn(c, ret);
	return ret;
}

static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
	struct bch_fs *c = trans->c;
	struct btree_node_iter iter;
	struct bkey unpacked;
	struct bkey_s_c k;
	struct bkey_buf prev, cur;
	int ret = 0;

	if (!btree_node_type_needs_gc(btree_node_type(b)))
		return 0;

	bch2_btree_node_iter_init_from_start(&iter, b);
	bch2_bkey_buf_init(&prev);
	bch2_bkey_buf_init(&cur);
	bkey_init(&prev.k->k);

	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
				       &k, initial);
		if (ret)
			break;

		bch2_btree_node_iter_advance(&iter, b);

		if (b->c.level) {
			bch2_bkey_buf_reassemble(&cur, c, k);

			ret = bch2_gc_check_topology(c, b, &prev, cur,
					bch2_btree_node_iter_end(&iter));
			if (ret)
				break;
		}
	}

	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	return ret;
}

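/*
 * Mark one btree during a runtime gc pass, walking its nodes in order and
 * finishing with the root; gc_pos is advanced as we go so that concurrent
 * updates can tell whether gc has already visited a given position.
 */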
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
			 bool initial, bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct btree *b;
	unsigned depth = metadata_only ? 1 : 0;
	int ret = 0;

	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));

	__for_each_btree_node(trans, iter, btree_id, POS_MIN,
			      0, depth, BTREE_ITER_PREFETCH, b, ret) {
		bch2_verify_btree_nr_keys(b);

		gc_pos_set(c, gc_pos_btree_node(b));

		ret = btree_gc_mark_node(trans, b, initial);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	mutex_lock(&c->btree_root_lock);
	b = bch2_btree_id_root(c, btree_id)->b;
	if (!btree_node_fake(b)) {
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);

		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
				       true, &k, initial);
	}
	gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
	mutex_unlock(&c->btree_root_lock);

	return ret;
}

static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
				      unsigned target_depth)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
	bch2_bkey_buf_init(&prev);
	bch2_bkey_buf_init(&cur);
	bkey_init(&prev.k->k);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
				       false, &k, true);
		if (ret)
			goto fsck_err;

		if (b->c.level) {
			bch2_bkey_buf_reassemble(&cur, c, k);
			k = bkey_i_to_s_c(cur.k);

			bch2_btree_and_journal_iter_advance(&iter);

			ret = bch2_gc_check_topology(c, b,
					&prev, cur,
					!bch2_btree_and_journal_iter_peek(&iter).k);
			if (ret)
				goto fsck_err;
		} else {
			bch2_btree_and_journal_iter_advance(&iter);
		}
	}

	if (b->c.level > target_depth) {
		bch2_btree_and_journal_iter_exit(&iter);
		bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

		while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
			struct btree *child;

			bch2_bkey_buf_reassemble(&cur, c, k);
			bch2_btree_and_journal_iter_advance(&iter);

			child = bch2_btree_node_get_noiter(trans, cur.k,
						b->c.btree_id, b->c.level - 1,
						false);
			ret = PTR_ERR_OR_ZERO(child);

			if (ret == -EIO) {
				bch2_topology_error(c);

				if (__fsck_err(c,
					       FSCK_CAN_FIX|
					       FSCK_CAN_IGNORE|
					       FSCK_NO_RATELIMIT,
					       btree_node_read_error,
					       "Unreadable btree node at btree %s level %u:\n"
					       " %s",
					       bch2_btree_id_str(b->c.btree_id),
					       b->c.level - 1,
					       (printbuf_reset(&buf),
						bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
				    should_restart_for_topology_repair(c)) {
					bch_info(c, "Halting mark and sweep to start topology repair pass");
					ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
					goto fsck_err;
				} else {
					/* Continue marking when opted to not
					 * fix the error: */
					ret = 0;
					set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
					continue;
				}
			} else if (ret) {
				bch_err_msg(c, ret, "getting btree node");
				break;
			}

			ret = bch2_gc_btree_init_recurse(trans, child,
							 target_depth);
			six_unlock_read(&child->c.lock);

			if (ret)
				break;
		}
	}
fsck_err:
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	bch2_btree_and_journal_iter_exit(&iter);
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_btree_init(struct btree_trans *trans,
			      enum btree_id btree_id,
			      bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	unsigned target_depth = metadata_only ? 1 : 0;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	b = bch2_btree_id_root(c, btree_id)->b;

	if (btree_node_fake(b))
		return 0;

	six_lock_read(&b->c.lock, NULL, NULL);
	printbuf_reset(&buf);
	bch2_bpos_to_text(&buf, b->data->min_key);
	if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
				btree_root_bad_min_key,
			"btree root with incorrect min_key: %s", buf.buf)) {
		bch_err(c, "repair unimplemented");
		ret = -BCH_ERR_fsck_repair_unimplemented;
		goto fsck_err;
	}

	printbuf_reset(&buf);
	bch2_bpos_to_text(&buf, b->data->max_key);
	if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
				btree_root_bad_max_key,
			"btree root with incorrect max_key: %s", buf.buf)) {
		bch_err(c, "repair unimplemented");
		ret = -BCH_ERR_fsck_repair_unimplemented;
		goto fsck_err;
	}

	if (b->c.level >= target_depth)
		ret = bch2_gc_btree_init_recurse(trans, b, target_depth);

	if (!ret) {
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);

		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
				       &k, true);
	}
fsck_err:
	six_unlock_read(&b->c.lock);

	bch_err_fn(c, ret);
	printbuf_exit(&buf);
	return ret;
}

static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return (int) btree_id_to_gc_phase(l) -
		(int) btree_id_to_gc_phase(r);
}

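/*
 * Mark all btrees, in gc phase order: when @initial is set (recovery) nodes
 * are walked via the btree-and-journal iterator, otherwise normal btree
 * iterators are used.
 */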
static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
{
	struct btree_trans *trans = bch2_trans_get(c);
	enum btree_id ids[BTREE_ID_NR];
	unsigned i;
	int ret = 0;

	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < BTREE_ID_NR && !ret; i++)
		ret = initial
			? bch2_gc_btree_init(trans, ids[i], metadata_only)
			: bch2_gc_btree(trans, ids[i], initial, metadata_only);

	for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
		if (!bch2_btree_id_root(c, i)->alive)
			continue;

		ret = initial
			? bch2_gc_btree_init(trans, i, metadata_only)
			: bch2_gc_btree(trans, i, initial, metadata_only);
	}

	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
				  u64 start, u64 end,
				  enum bch_data_type type,
				  unsigned flags)
{
	u64 b = sector_to_bucket(ca, start);

	do {
		unsigned sectors =
			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;

		bch2_mark_metadata_bucket(c, ca, b, type, sectors,
					  gc_phase(GC_PHASE_SB), flags);
		b++;
		start += sectors;
	} while (start < end);
}

static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
				     unsigned flags)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	unsigned i;
	u64 b;

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);

		if (offset == BCH_SB_SECTOR)
			mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
					      BCH_DATA_sb, flags);

		mark_metadata_sectors(c, ca, offset,
				      offset + (1 << layout->sb_max_size_bits),
				      BCH_DATA_sb, flags);
	}

	for (i = 0; i < ca->journal.nr; i++) {
		b = ca->journal.buckets[i];
		bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
					  ca->mi.bucket_size,
					  gc_phase(GC_PHASE_SB), flags);
	}
}

static void bch2_mark_superblocks(struct bch_fs *c)
{
	mutex_lock(&c->sb_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_SB));

	for_each_online_member(c, ca)
		bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
	mutex_unlock(&c->sb_lock);
}

#if 0
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
	struct btree_update *as;
	struct pending_btree_node_free *d;

	mutex_lock(&c->btree_interior_update_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));

	for_each_pending_btree_node_free(c, as, d)
		if (d->index_update_done)
			bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);

	mutex_unlock(&c->btree_interior_update_lock);
}
#endif

static void bch2_gc_free(struct bch_fs *c)
{
	genradix_free(&c->reflink_gc_table);
	genradix_free(&c->gc_stripes);

	for_each_member_device(c, ca) {
		kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
			sizeof(struct bucket_array) +
			ca->mi.nbuckets * sizeof(struct bucket));
		ca->buckets_gc = NULL;

		free_percpu(ca->usage_gc);
		ca->usage_gc = NULL;
	}

	free_percpu(c->usage_gc);
	c->usage_gc = NULL;
}

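/*
 * Compare the usage counters gc just recomputed against the filesystem's live
 * counters, report any mismatches as fsck errors, and copy the gc values over
 * the live ones.
 */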
static int bch2_gc_done(struct bch_fs *c,
			bool initial, bool metadata_only)
{
	struct bch_dev *ca = NULL;
	struct printbuf buf = PRINTBUF;
	bool verify = !metadata_only &&
		!c->opts.reconstruct_alloc &&
		(!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
	unsigned i;
	int ret = 0;

	percpu_down_write(&c->mark_lock);

#define copy_field(_err, _f, _msg, ...)					\
	if (dst->_f != src->_f &&					\
	    (!verify ||							\
	     fsck_err(c, _err, _msg ": got %llu, should be %llu"	\
		      , ##__VA_ARGS__, dst->_f, src->_f)))		\
		dst->_f = src->_f
#define copy_dev_field(_err, _f, _msg, ...)				\
	copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
#define copy_fs_field(_err, _f, _msg, ...)				\
	copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)

	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		bch2_fs_usage_acc_to_base(c, i);

	__for_each_member_device(c, ca) {
		struct bch_dev_usage *dst = ca->usage_base;
		struct bch_dev_usage *src = (void *)
			bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
					     dev_usage_u64s());

		for (i = 0; i < BCH_DATA_NR; i++) {
			copy_dev_field(dev_usage_buckets_wrong,
				       d[i].buckets, "%s buckets", bch2_data_type_str(i));
			copy_dev_field(dev_usage_sectors_wrong,
				       d[i].sectors, "%s sectors", bch2_data_type_str(i));
			copy_dev_field(dev_usage_fragmented_wrong,
				       d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
		}
	}

	{
		unsigned nr = fs_usage_u64s(c);
		struct bch_fs_usage *dst = c->usage_base;
		struct bch_fs_usage *src = (void *)
			bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);

		copy_fs_field(fs_usage_hidden_wrong,
			      b.hidden, "hidden");
		copy_fs_field(fs_usage_btree_wrong,
			      b.btree, "btree");

		if (!metadata_only) {
			copy_fs_field(fs_usage_data_wrong,
				      b.data, "data");
			copy_fs_field(fs_usage_cached_wrong,
				      b.cached, "cached");
			copy_fs_field(fs_usage_reserved_wrong,
				      b.reserved, "reserved");
			copy_fs_field(fs_usage_nr_inodes_wrong,
				      b.nr_inodes,"nr_inodes");

			for (i = 0; i < BCH_REPLICAS_MAX; i++)
				copy_fs_field(fs_usage_persistent_reserved_wrong,
					      persistent_reserved[i],
					      "persistent_reserved[%i]", i);
		}

		for (i = 0; i < c->replicas.nr; i++) {
			struct bch_replicas_entry_v1 *e =
				cpu_replicas_entry(&c->replicas, i);

			if (metadata_only &&
			    (e->data_type == BCH_DATA_user ||
			     e->data_type == BCH_DATA_cached))
				continue;

			printbuf_reset(&buf);
			bch2_replicas_entry_to_text(&buf, e);

			copy_fs_field(fs_usage_replicas_wrong,
				      replicas[i], "%s", buf.buf);
		}
	}

#undef copy_fs_field
#undef copy_dev_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
	if (ca)
		percpu_ref_put(&ca->ref);
	bch_err_fn(c, ret);

	percpu_up_write(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_start(struct bch_fs *c)
{
	BUG_ON(c->usage_gc);

	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
					 sizeof(u64), GFP_KERNEL);
	if (!c->usage_gc) {
		bch_err(c, "error allocating c->usage_gc");
		return -BCH_ERR_ENOMEM_gc_start;
	}

	for_each_member_device(c, ca) {
		BUG_ON(ca->usage_gc);

		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage_gc) {
			bch_err(c, "error allocating ca->usage_gc");
			percpu_ref_put(&ca->ref);
			return -BCH_ERR_ENOMEM_gc_start;
		}

		this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
			       ca->mi.nbuckets - ca->mi.first_bucket);
	}

	return 0;
}

static int bch2_gc_reset(struct bch_fs *c)
{
	for_each_member_device(c, ca) {
		free_percpu(ca->usage_gc);
		ca->usage_gc = NULL;
	}

	free_percpu(c->usage_gc);
	c->usage_gc = NULL;

	return bch2_gc_start(c);
}

/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
				     struct bch_alloc_v4 r)
{
	return l.gen != r.gen ||
		l.oldest_gen != r.oldest_gen ||
		l.data_type != r.data_type ||
		l.dirty_sectors != r.dirty_sectors ||
		l.cached_sectors != r.cached_sectors ||
		l.stripe_redundancy != r.stripe_redundancy ||
		l.stripe != r.stripe;
}

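/*
 * Reconcile one alloc key with the corresponding gc bucket: if the bucket's
 * gen, sector counts, stripe fields or data type disagree with what gc
 * computed, report the mismatch and rewrite the alloc key (the update is done
 * with BTREE_TRIGGER_NORUN, since gc already updated the in-memory counters).
 */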
static int bch2_alloc_write_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k,
				bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
	struct bucket gc, *b;
	struct bkey_i_alloc_v4 *a;
	struct bch_alloc_v4 old_convert, new;
	const struct bch_alloc_v4 *old;
	enum bch_data_type type;
	int ret;

	old = bch2_alloc_to_v4(k, &old_convert);
	new = *old;

	percpu_down_read(&c->mark_lock);
	b = gc_bucket(ca, iter->pos.offset);

	/*
	 * b->data_type doesn't yet include need_discard & need_gc_gen states -
	 * fix that here:
	 */
	type = __alloc_data_type(b->dirty_sectors,
				 b->cached_sectors,
				 b->stripe,
				 *old,
				 b->data_type);
	if (b->data_type != type) {
		struct bch_dev_usage *u;

		preempt_disable();
		u = this_cpu_ptr(ca->usage_gc);
		u->d[b->data_type].buckets--;
		b->data_type = type;
		u->d[b->data_type].buckets++;
		preempt_enable();
	}

	gc = *b;
	percpu_up_read(&c->mark_lock);

	if (metadata_only &&
	    gc.data_type != BCH_DATA_sb &&
	    gc.data_type != BCH_DATA_journal &&
	    gc.data_type != BCH_DATA_btree)
		return 0;

	if (gen_after(old->gen, gc.gen))
		return 0;

	if (c->opts.reconstruct_alloc ||
	    fsck_err_on(new.data_type != gc.data_type, c,
			alloc_key_data_type_wrong,
			"bucket %llu:%llu gen %u has wrong data_type"
			": got %s, should be %s",
			iter->pos.inode, iter->pos.offset,
			gc.gen,
			bch2_data_type_str(new.data_type),
			bch2_data_type_str(gc.data_type)))
		new.data_type = gc.data_type;

#define copy_bucket_field(_errtype, _f)					\
	if (c->opts.reconstruct_alloc ||				\
	    fsck_err_on(new._f != gc._f, c, _errtype,			\
			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
			": got %u, should be %u",			\
			iter->pos.inode, iter->pos.offset,		\
			gc.gen,						\
			bch2_data_type_str(gc.data_type),		\
			new._f, gc._f))					\
		new._f = gc._f;						\

	copy_bucket_field(alloc_key_gen_wrong,
			  gen);
	copy_bucket_field(alloc_key_dirty_sectors_wrong,
			  dirty_sectors);
	copy_bucket_field(alloc_key_cached_sectors_wrong,
			  cached_sectors);
	copy_bucket_field(alloc_key_stripe_wrong,
			  stripe);
	copy_bucket_field(alloc_key_stripe_redundancy_wrong,
			  stripe_redundancy);
#undef copy_bucket_field

	if (!bch2_alloc_v4_cmp(*old, new))
		return 0;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	a->v = new;

	/*
	 * The trigger normally makes sure this is set, but we're not running
	 * triggers:
	 */
	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
fsck_err:
	return ret;
}

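/*
 * Write out the gc-corrected alloc info: walk every bucket on every member
 * device and reconcile its alloc key via bch2_alloc_write_key().
 */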
static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
{
	int ret = 0;

	for_each_member_device(c, ca) {
		ret = bch2_trans_run(c,
			for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
					POS(ca->dev_idx, ca->mi.first_bucket),
					POS(ca->dev_idx, ca->mi.nbuckets - 1),
					BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
					NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
				bch2_alloc_write_key(trans, &iter, k, metadata_only)));
		if (ret) {
			percpu_ref_put(&ca->ref);
			break;
		}
	}

	bch_err_fn(c, ret);
	return ret;
}

static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
	for_each_member_device(c, ca) {
		struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
					ca->mi.nbuckets * sizeof(struct bucket),
					GFP_KERNEL|__GFP_ZERO);
		if (!buckets) {
			percpu_ref_put(&ca->ref);
			bch_err(c, "error allocating ca->buckets[gc]");
			return -BCH_ERR_ENOMEM_gc_alloc_start;
		}

		buckets->first_bucket = ca->mi.first_bucket;
		buckets->nbuckets = ca->mi.nbuckets;
		rcu_assign_pointer(ca->buckets_gc, buckets);
	}

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
				   BTREE_ITER_PREFETCH, k, ({
			struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
			struct bucket *g = gc_bucket(ca, k.k->p.offset);

			struct bch_alloc_v4 a_convert;
			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);

			g->gen_valid = 1;
			g->gen = a->gen;

			if (metadata_only &&
			    (a->data_type == BCH_DATA_user ||
			     a->data_type == BCH_DATA_cached ||
			     a->data_type == BCH_DATA_parity)) {
				g->data_type = a->data_type;
				g->dirty_sectors = a->dirty_sectors;
				g->cached_sectors = a->cached_sectors;
				g->stripe = a->stripe;
				g->stripe_redundancy = a->stripe_redundancy;
			}

			0;
		})));
	bch_err_fn(c, ret);
	return ret;
}

static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
{
	for_each_member_device(c, ca) {
		struct bucket_array *buckets = gc_bucket_array(ca);
		struct bucket *g;

		for_each_bucket(g, buckets) {
			if (metadata_only &&
			    (g->data_type == BCH_DATA_user ||
			     g->data_type == BCH_DATA_cached ||
			     g->data_type == BCH_DATA_parity))
				continue;
			g->data_type = 0;
			g->dirty_sectors = 0;
			g->cached_sectors = 0;
		}
	}
}

static int bch2_gc_write_reflink_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k,
				     size_t *idx)
{
	struct bch_fs *c = trans->c;
	const __le64 *refcount = bkey_refcount_c(k);
	struct printbuf buf = PRINTBUF;
	struct reflink_gc *r;
	int ret = 0;

	if (!refcount)
		return 0;

	while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
	       r->offset < k.k->p.offset)
		++*idx;

	if (!r ||
	    r->offset != k.k->p.offset ||
	    r->size != k.k->size) {
		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
		return -EINVAL;
	}

	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
			reflink_v_refcount_wrong,
			"reflink key has wrong refcount:\n"
			" %s\n"
			" should be %u",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
			r->refcount)) {
		struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);

		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		if (!r->refcount)
			new->k.type = KEY_TYPE_deleted;
		else
			*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
{
	size_t idx = 0;

	if (metadata_only)
		return 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_reflink, POS_MIN,
				BTREE_ITER_PREFETCH, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
	c->reflink_gc_nr = 0;
	return ret;
}

static int bch2_gc_reflink_start(struct bch_fs *c,
				 bool metadata_only)
{

	if (metadata_only)
		return 0;

	c->reflink_gc_nr = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
				   BTREE_ITER_PREFETCH, k, ({
			const __le64 *refcount = bkey_refcount_c(k);

			if (!refcount)
				continue;

			struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
							c->reflink_gc_nr++, GFP_KERNEL);
			if (!r) {
				ret = -BCH_ERR_ENOMEM_gc_reflink_start;
				break;
			}

			r->offset = k.k->p.offset;
			r->size = k.k->size;
			r->refcount = 0;
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}

static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
{
	struct genradix_iter iter;
	struct reflink_gc *r;

	genradix_for_each(&c->reflink_gc_table, iter, r)
		r->refcount = 0;
}

static int bch2_gc_write_stripes_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	const struct bch_stripe *s;
	struct gc_stripe *m;
	bool bad = false;
	unsigned i;
	int ret = 0;

	if (k.k->type != KEY_TYPE_stripe)
		return 0;

	s = bkey_s_c_to_stripe(k).v;
	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);

	for (i = 0; i < s->nr_blocks; i++) {
		u32 old = stripe_blockcount_get(s, i);
		u32 new = (m ? m->block_sectors[i] : 0);

		if (old != new) {
			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
				   i, old, new);
			bad = true;
		}
	}

	if (bad)
		bch2_bkey_val_to_text(&buf, c, k);

	if (fsck_err_on(bad, c, stripe_sector_count_wrong,
			"%s", buf.buf)) {
		struct bkey_i_stripe *new;

		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		bkey_reassemble(&new->k_i, k);

		for (i = 0; i < new->v.nr_blocks; i++)
			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);

		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
{
	if (metadata_only)
		return 0;

	return bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_stripes, POS_MIN,
				BTREE_ITER_PREFETCH, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_gc_write_stripes_key(trans, &iter, k)));
}

static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
{
	genradix_free(&c->gc_stripes);
}

/**
 * bch2_gc - walk _all_ references to buckets, and recompute them:
 *
 * @c: filesystem object
 * @initial: are we in recovery?
 * @metadata_only: are we just checking metadata references, or everything?
 *
 * Returns: 0 on success, or standard errcode on failure
 *
 * Order matters here:
 *  - Concurrent GC relies on the fact that we have a total ordering for
 *    everything that GC walks - see gc_will_visit_node(),
 *    gc_will_visit_root()
 *
 *  - also, references move around in the course of index updates and
 *    various other crap: everything needs to agree on the ordering
 *    references are allowed to move around in - e.g., we're allowed to
 *    start with a reference owned by an open_bucket (the allocator) and
 *    move it to the btree, but not the reverse.
 *
 *    This is necessary to ensure that gc doesn't miss references that
 *    move around - if references move backwards in the ordering GC
 *    uses, GC could skip past them
 */
int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
	unsigned iter = 0;
	int ret;

	lockdep_assert_held(&c->state_lock);

	down_write(&c->gc_lock);

	bch2_btree_interior_updates_flush(c);

	ret = bch2_gc_start(c) ?:
		bch2_gc_alloc_start(c, metadata_only) ?:
		bch2_gc_reflink_start(c, metadata_only);
	if (ret)
		goto out;
again:
	gc_pos_set(c, gc_phase(GC_PHASE_START));

	bch2_mark_superblocks(c);

	ret = bch2_gc_btrees(c, initial, metadata_only);

	if (ret)
		goto out;

#if 0
	bch2_mark_pending_btree_node_frees(c);
#endif
	c->gc_count++;

	if (test_bit(BCH_FS_need_another_gc, &c->flags) ||
	    (!iter && bch2_test_restart_gc)) {
		if (iter++ > 2) {
			bch_info(c, "Unable to fix bucket gens, looping");
			ret = -EINVAL;
			goto out;
		}

		/*
		 * XXX: make sure gens we fixed got saved
		 */
		bch_info(c, "Second GC pass needed, restarting:");
		clear_bit(BCH_FS_need_another_gc, &c->flags);
		__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

		bch2_gc_stripes_reset(c, metadata_only);
		bch2_gc_alloc_reset(c, metadata_only);
		bch2_gc_reflink_reset(c, metadata_only);
		ret = bch2_gc_reset(c);
		if (ret)
			goto out;

		/* flush fsck errors, reset counters */
		bch2_flush_fsck_errs(c);
		goto again;
	}
out:
	if (!ret) {
		bch2_journal_block(&c->journal);

		ret = bch2_gc_stripes_done(c, metadata_only) ?:
			bch2_gc_reflink_done(c, metadata_only) ?:
			bch2_gc_alloc_done(c, metadata_only) ?:
			bch2_gc_done(c, initial, metadata_only);

		bch2_journal_unblock(&c->journal);
	}

	percpu_down_write(&c->mark_lock);
	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

	bch2_gc_free(c);
	percpu_up_write(&c->mark_lock);

	up_write(&c->gc_lock);

	/*
	 * At startup, allocations can happen directly instead of via the
	 * allocator thread - issue wakeup in case they blocked on gc_lock:
	 */
	closure_wake_up(&c->freelist_wait);
	bch_err_fn(c, ret);
	return ret;
}

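/*
 * bch2_gc_gens() helper: drop stale cached pointers via
 * bch2_extent_normalize() when a pointer has become sufficiently stale, and
 * track the oldest pointer gen seen per bucket in ca->oldest_gen, which is
 * later written back to the alloc btree.
 */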
static int gc_btree_gens_key(struct btree_trans *trans,
			     struct btree_iter *iter,
			     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bkey_i *u;
	int ret;

	percpu_down_read(&c->mark_lock);
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

		if (ptr_stale(ca, ptr) > 16) {
			percpu_up_read(&c->mark_lock);
			goto update;
		}
	}

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];

		if (gen_after(*gen, ptr->gen))
			*gen = ptr->gen;
	}
	percpu_up_read(&c->mark_lock);
	return 0;
update:
	u = bch2_bkey_make_mut(trans, iter, &k, 0);
	ret = PTR_ERR_OR_ZERO(u);
	if (ret)
		return ret;

	bch2_extent_normalize(c, bkey_i_to_s(u));
	return 0;
}

static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
				       struct bkey_s_c k)
{
	struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
	struct bkey_i_alloc_v4 *a_mut;
	int ret;

	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
		return 0;

	a_mut = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a_mut);
	if (ret)
		return ret;

	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
	a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);

	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}

int bch2_gc_gens(struct bch_fs *c)
{
	u64 b, start_time = local_clock();
	int ret;

	/*
	 * Ideally we would be using state_lock and not gc_lock here, but that
	 * introduces a deadlock in the RO path - we currently take the state
	 * lock at the start of going RO, thus the gc thread may get stuck:
	 */
	if (!mutex_trylock(&c->gc_gens_lock))
		return 0;

	trace_and_count(c, gc_gens_start, c);
	down_read(&c->gc_lock);

	for_each_member_device(c, ca) {
		struct bucket_gens *gens = bucket_gens(ca);

		BUG_ON(ca->oldest_gen);

		ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
		if (!ca->oldest_gen) {
			percpu_ref_put(&ca->ref);
			ret = -BCH_ERR_ENOMEM_gc_gens;
			goto err;
		}

		for (b = gens->first_bucket;
		     b < gens->nbuckets; b++)
			ca->oldest_gen[b] = gens->b[b];
	}

	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		if (btree_type_has_ptrs(i)) {
			c->gc_gens_btree = i;
			c->gc_gens_pos = POS_MIN;

			ret = bch2_trans_run(c,
				for_each_btree_key_commit(trans, iter, i,
						POS_MIN,
						BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
						k,
						NULL, NULL,
						BCH_TRANS_COMMIT_no_enospc,
					gc_btree_gens_key(trans, &iter, k)));
			if (ret)
				goto err;
		}

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS_MIN,
				BTREE_ITER_PREFETCH,
				k,
				NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc,
			bch2_alloc_write_oldest_gen(trans, &iter, k)));
	if (ret)
		goto err;

	c->gc_gens_btree = 0;
	c->gc_gens_pos = POS_MIN;

	c->gc_count++;

	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
	trace_and_count(c, gc_gens_end, c);
err:
	for_each_member_device(c, ca) {
		kvfree(ca->oldest_gen);
		ca->oldest_gen = NULL;
	}

	up_read(&c->gc_lock);
	mutex_unlock(&c->gc_gens_lock);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

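/*
 * Background gc thread: wakes up when kicked or, if periodic gc is enabled,
 * after enough IO has happened per the write io clock; currently it only runs
 * bch2_gc_gens(), since full runtime gc is disabled (see the #if 0 below).
 */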
static int bch2_gc_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct io_clock *clock = &c->io_clock[WRITE];
	unsigned long last = atomic64_read(&clock->now);
	unsigned last_kick = atomic_read(&c->kick_gc);

	set_freezable();

	while (1) {
		while (1) {
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				return 0;
			}

			if (atomic_read(&c->kick_gc) != last_kick)
				break;

			if (c->btree_gc_periodic) {
				unsigned long next = last + c->capacity / 16;

				if (atomic64_read(&clock->now) >= next)
					break;

				bch2_io_clock_schedule_timeout(clock, next);
			} else {
				schedule();
			}

			try_to_freeze();
		}
		__set_current_state(TASK_RUNNING);

		last = atomic64_read(&clock->now);
		last_kick = atomic_read(&c->kick_gc);

		/*
		 * Full gc is currently incompatible with btree key cache:
		 */
#if 0
		ret = bch2_gc(c, false, false);
#else
		bch2_gc_gens(c);
#endif
		debug_check_no_locks_held();
	}

	return 0;
}

void bch2_gc_thread_stop(struct bch_fs *c)
{
	struct task_struct *p;

	p = c->gc_thread;
	c->gc_thread = NULL;

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_gc_thread_start(struct bch_fs *c)
{
	struct task_struct *p;

	if (c->gc_thread)
		return 0;

	p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
	if (IS_ERR(p)) {
		bch_err_fn(c, PTR_ERR(p));
		return PTR_ERR(p);
	}

	get_task_struct(p);
	c->gc_thread = p;
	wake_up_process(p);
	return 0;
}