// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>

#define DROP_THIS_NODE		10
#define DROP_PREV_NODE		11

static bool should_restart_for_topology_repair(struct bch_fs *c)
{
	return c->opts.fix_errors != FSCK_FIX_no &&
		!(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
}

static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	preempt_disable();
	write_seqcount_begin(&c->gc_pos_lock);
	c->gc_pos = new_pos;
	write_seqcount_end(&c->gc_pos_lock);
	preempt_enable();
}

static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
	__gc_pos_set(c, new_pos);
}

/*
 * Missing: if an interior btree node is empty, we need to do something -
 * perhaps just kill it
 */
static int bch2_gc_check_topology(struct bch_fs *c,
				  struct btree *b,
				  struct bkey_buf *prev,
				  struct bkey_buf cur,
				  bool is_last)
{
	struct bpos node_start = b->data->min_key;
	struct bpos node_end = b->data->max_key;
	struct bpos expected_start = bkey_deleted(&prev->k->k)
		? node_start
		: bpos_successor(prev->k->k.p);
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
	int ret = 0;

	if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);

		if (!bpos_eq(expected_start, bp->v.min_key)) {
			bch2_topology_error(c);

			if (bkey_deleted(&prev->k->k)) {
				prt_printf(&buf1, "start of node: ");
				bch2_bpos_to_text(&buf1, node_start);
			} else {
				bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
			}
			bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));

			if (__fsck_err(c,
				       FSCK_CAN_FIX|
				       FSCK_CAN_IGNORE|
				       FSCK_NO_RATELIMIT,
				       btree_node_topology_bad_min_key,
				       "btree node with incorrect min_key at btree %s level %u:\n"
				       "  prev %s\n"
				       "  cur %s",
				       bch2_btree_id_str(b->c.btree_id), b->c.level,
				       buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
				bch_info(c, "Halting mark and sweep to start topology repair pass");
				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
				goto err;
			} else {
				set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
			}
		}
	}

	if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
		bch2_topology_error(c);

		printbuf_reset(&buf1);
		printbuf_reset(&buf2);

		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
		bch2_bpos_to_text(&buf2, node_end);

		if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
			       btree_node_topology_bad_max_key,
			       "btree node with incorrect max_key at btree %s level %u:\n"
			       "  %s\n"
			       "  expected %s",
			       bch2_btree_id_str(b->c.btree_id), b->c.level,
			       buf1.buf, buf2.buf) &&
		    should_restart_for_topology_repair(c)) {
			bch_info(c, "Halting mark and sweep to start topology repair pass");
			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
			goto err;
		} else {
			set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
		}
	}

	bch2_bkey_buf_copy(prev, c, cur.k);
err:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
	switch (b->key.k.type) {
	case KEY_TYPE_btree_ptr: {
		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);

		dst->k.p = src->k.p;
		dst->v.mem_ptr = 0;
		dst->v.seq = b->data->keys.seq;
		dst->v.sectors_written = 0;
		dst->v.flags = 0;
		dst->v.min_key = b->data->min_key;
		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
		break;
	}
	case KEY_TYPE_btree_ptr_v2:
		bkey_copy(&dst->k_i, &b->key);
		break;
	default:
		BUG();
	}
}

static void bch2_btree_node_update_key_early(struct btree_trans *trans,
					     enum btree_id btree, unsigned level,
					     struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

/* Topology repair: rewrite a node's key, via the journal keys, with a corrected min_key: */
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->min_key = new_min;
	new->v.min_key = new_min;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);
	bkey_copy(&b->key, &new->k_i);
	return 0;
}

/* Topology repair: as above, but for a corrected max_key: */
static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
	if (ret)
		return ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->max_key = new_max;
	new->k.p = new_max;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);

	mutex_lock(&c->btree_cache.lock);
	bch2_btree_node_hash_remove(&c->btree_cache, b);

	bkey_copy(&b->key, &new->k_i);
	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
	BUG_ON(ret);
	mutex_unlock(&c->btree_cache.lock);
	return 0;
}

static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
					struct btree *prev, struct btree *cur)
{
	struct bpos expected_start = !prev
		? b->data->min_key
		: bpos_successor(prev->key.k.p);
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
	int ret = 0;

	if (!prev) {
		prt_printf(&buf1, "start of node: ");
		bch2_bpos_to_text(&buf1, b->data->min_key);
	} else {
		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
	}

	bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));

	if (prev &&
	    bpos_gt(expected_start, cur->data->min_key) &&
	    BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
		/* cur overwrites prev: */

		if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
						cur->data->min_key), c,
				btree_node_topology_overwritten_by_next_node,
				"btree node overwritten by next node at btree %s level %u:\n"
				"  node %s\n"
				"  next %s",
				bch2_btree_id_str(b->c.btree_id), b->c.level,
				buf1.buf, buf2.buf)) {
			ret = DROP_PREV_NODE;
			goto out;
		}

		if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
						 bpos_predecessor(cur->data->min_key)), c,
				btree_node_topology_bad_max_key,
				"btree node with incorrect max_key at btree %s level %u:\n"
				"  node %s\n"
				"  next %s",
				bch2_btree_id_str(b->c.btree_id), b->c.level,
				buf1.buf, buf2.buf))
			ret = set_node_max(c, prev,
					   bpos_predecessor(cur->data->min_key));
	} else {
		/* prev overwrites cur: */

		if (mustfix_fsck_err_on(bpos_ge(expected_start,
						cur->data->max_key), c,
				btree_node_topology_overwritten_by_prev_node,
				"btree node overwritten by prev node at btree %s level %u:\n"
				"  prev %s\n"
				"  node %s",
				bch2_btree_id_str(b->c.btree_id), b->c.level,
				buf1.buf, buf2.buf)) {
			ret = DROP_THIS_NODE;
			goto out;
		}

		if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
				btree_node_topology_bad_min_key,
				"btree node with incorrect min_key at btree %s level %u:\n"
				"  prev %s\n"
				"  node %s",
				bch2_btree_id_str(b->c.btree_id), b->c.level,
				buf1.buf, buf2.buf))
			ret = set_node_min(c, cur, expected_start);
	}
out:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
				 struct btree *child)
{
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
	int ret = 0;

	bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
	bch2_bpos_to_text(&buf2, b->key.k.p);

	if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
				btree_node_topology_bad_max_key,
				"btree node with incorrect max_key at btree %s level %u:\n"
				"  %s\n"
				"  expected %s",
				bch2_btree_id_str(b->c.btree_id), b->c.level,
				buf1.buf, buf2.buf)) {
		ret = set_node_max(c, child, b->key.k.p);
		if (ret)
			goto err;
	}
err:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf prev_k, cur_k;
	struct btree *prev = NULL, *cur = NULL;
	bool have_child, dropped_children = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!b->c.level)
		return 0;
again:
	prev = NULL;
	have_child = dropped_children = false;
	bch2_bkey_buf_init(&prev_k);
	bch2_bkey_buf_init(&cur_k);
	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		bch2_btree_and_journal_iter_advance(&iter);
		bch2_bkey_buf_reassemble(&cur_k, c, k);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
					b->c.btree_id, b->c.level - 1,
					false);
		ret = PTR_ERR_OR_ZERO(cur);

		printbuf_reset(&buf);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));

		if (mustfix_fsck_err_on(ret == -EIO, c,
				btree_node_unreadable,
				"Topology repair: unreadable btree node at btree %s level %u:\n"
				"  %s",
				bch2_btree_id_str(b->c.btree_id),
				b->c.level - 1,
				buf.buf)) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		if (ret) {
			bch_err_msg(c, ret, "getting btree node");
			break;
		}

		ret = btree_repair_node_boundaries(c, b, prev, cur);

		if (ret == DROP_THIS_NODE) {
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		if (prev)
			six_unlock_read(&prev->c.lock);
		prev = NULL;

		if (ret == DROP_PREV_NODE) {
			bch2_btree_node_evict(trans, prev_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, prev_k.k->k.p);
			if (ret)
				break;

			bch2_btree_and_journal_iter_exit(&iter);
			bch2_bkey_buf_exit(&prev_k, c);
			bch2_bkey_buf_exit(&cur_k, c);
			goto again;
		} else if (ret)
			break;

		prev = cur;
		cur = NULL;
		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
	}

	if (!ret && !IS_ERR_OR_NULL(prev)) {
		BUG_ON(cur);
		ret = btree_repair_node_end(c, b, prev);
	}

	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	prev = NULL;
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);
	cur = NULL;

	if (ret)
		goto err;

	bch2_btree_and_journal_iter_exit(&iter);
	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		bch2_bkey_buf_reassemble(&cur_k, c, k);
		bch2_btree_and_journal_iter_advance(&iter);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
					b->c.btree_id, b->c.level - 1,
					false);
		ret = PTR_ERR_OR_ZERO(cur);

		if (ret) {
			bch_err_msg(c, ret, "getting btree node");
			goto err;
		}

		ret = bch2_btree_repair_topology_recurse(trans, cur);
		six_unlock_read(&cur->c.lock);
		cur = NULL;

		if (ret == DROP_THIS_NODE) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			dropped_children = true;
		}

		if (ret)
			goto err;

		have_child = true;
	}

	printbuf_reset(&buf);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (mustfix_fsck_err_on(!have_child, c,
			btree_node_topology_interior_node_empty,
			"empty interior btree node at btree %s level %u\n"
			"  %s",
			bch2_btree_id_str(b->c.btree_id),
			b->c.level, buf.buf))
		ret = DROP_THIS_NODE;
err:
fsck_err:
	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);

	bch2_btree_and_journal_iter_exit(&iter);
	bch2_bkey_buf_exit(&prev_k, c);
	bch2_bkey_buf_exit(&cur_k, c);

	if (!ret && dropped_children)
		goto again;

	printbuf_exit(&buf);
	return ret;
}

int bch2_check_topology(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree *b;
	unsigned i;
	int ret = 0;

	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->alive)
			continue;

		b = r->b;
		if (btree_node_fake(b))
			continue;

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		ret = bch2_btree_repair_topology_recurse(trans, b);
		six_unlock_read(&b->c.lock);

		if (ret == DROP_THIS_NODE) {
			bch_err(c, "empty btree root - repair unimplemented");
			ret = -BCH_ERR_fsck_repair_unimplemented;
		}
	}

	bch2_trans_put(trans);

	return ret;
}

/*
 * Check a key's pointers against the in-memory GC bucket/stripe state,
 * repairing either the bucket info or the key itself as needed:
 */
static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
			       unsigned level, bool is_root,
			       struct bkey_s_c *k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
	const union bch_extent_entry *entry_c;
	struct extent_ptr_decoded p = { 0 };
	bool do_update = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	/*
	 * XXX
	 * use check_bucket_ref here
	 */
	bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);

		if (!g->gen_valid &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, ptr_to_missing_alloc_key,
			      "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
			      p.ptr.gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
			if (!p.ptr.cached) {
				g->gen_valid = true;
				g->gen = p.ptr.gen;
			} else {
				do_update = true;
			}
		}

		if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, ptr_gen_newer_than_bucket_gen,
			      "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
			      p.ptr.gen, g->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
			if (!p.ptr.cached) {
				g->gen_valid = true;
				g->gen = p.ptr.gen;
				g->data_type = 0;
				g->dirty_sectors = 0;
				g->cached_sectors = 0;
				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
			} else {
				do_update = true;
			}
		}

		if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, ptr_gen_newer_than_bucket_gen,
			      "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
			      p.ptr.gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
			do_update = true;

		if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, stale_dirty_ptr,
			      "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
			      p.ptr.gen, g->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
			do_update = true;

		if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
			continue;

		if (fsck_err_on(bucket_data_type(g->data_type) &&
				bucket_data_type(g->data_type) != data_type, c,
				ptr_bucket_data_type_mismatch,
				"bucket %u:%zu different types of data in same bucket: %s, %s\n"
				"while marking %s",
				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
				bch2_data_types[g->data_type],
				bch2_data_types[data_type],
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
			if (data_type == BCH_DATA_btree) {
				g->data_type = data_type;
				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
			} else {
				do_update = true;
			}
		}

		if (p.has_ec) {
			struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);

			if (fsck_err_on(!m || !m->alive, c,
					ptr_to_missing_stripe,
					"pointer to nonexistent stripe %llu\n"
					"while marking %s",
					(u64) p.ec.idx,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
				do_update = true;

			if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
					ptr_to_incorrect_stripe,
					"pointer does not match stripe %llu\n"
					"while marking %s",
					(u64) p.ec.idx,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
				do_update = true;
		}
	}

	if (do_update) {
		struct bkey_ptrs ptrs;
		union bch_extent_entry *entry;
		struct bch_extent_ptr *ptr;
		struct bkey_i *new;

		if (is_root) {
			bch_err(c, "cannot update btree roots yet");
			ret = -EINVAL;
			goto err;
		}

		new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
		if (!new) {
			ret = -BCH_ERR_ENOMEM_gc_repair_key;
			bch_err_msg(c, ret, "allocating new key");
			goto err;
		}

		bkey_reassemble(new, *k);

		if (level) {
			/*
			 * We don't want to drop btree node pointers - if the
			 * btree node isn't there anymore, the read path will
			 * sort it out:
			 */
			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
			bkey_for_each_ptr(ptrs, ptr) {
				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
				struct bucket *g = PTR_GC_BUCKET(ca, ptr);

				ptr->gen = g->gen;
			}
		} else {
			bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
				enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);

				(ptr->cached &&
				 (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
				(!ptr->cached &&
				 gen_cmp(ptr->gen, g->gen) < 0) ||
				gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
				(g->data_type &&
				 g->data_type != data_type);
			}));
again:
			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
			bkey_extent_entry_for_each(ptrs, entry) {
				if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
					struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
									   entry->stripe_ptr.idx);
					union bch_extent_entry *next_ptr;

					bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
						if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
							goto found;
					next_ptr = NULL;
found:
					if (!next_ptr) {
						bch_err(c, "aieee, found stripe ptr with no data ptr");
						continue;
					}

					if (!m || !m->alive ||
					    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
								       &next_ptr->ptr,
								       m->sectors)) {
						bch2_bkey_extent_entry_drop(new, entry);
						goto again;
					}
				}
			}
		}

		ret = bch2_journal_key_insert_take(c, btree_id, level, new);
		if (ret) {
			kfree(new);
			goto err;
		}

		if (level)
			bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);

		if (0) {
			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, *k);
			bch_info(c, "updated %s", buf.buf);

			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
			bch_info(c, "new key %s", buf.buf);
		}

		*k = bkey_i_to_s_c(new);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* marking of btree keys/nodes: */

static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
			    unsigned level, bool is_root,
			    struct bkey_s_c *k,
			    bool initial)
{
	struct bch_fs *c = trans->c;
	struct bkey deleted = KEY(0, 0, 0);
	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
	unsigned flags =
		BTREE_TRIGGER_GC|
		(initial ? BTREE_TRIGGER_NOATOMIC : 0);
	int ret = 0;

	deleted.p = k->k->p;

	if (initial) {
		BUG_ON(bch2_journal_seq_verify &&
		       k->k->version.lo > atomic64_read(&c->journal.seq));

		ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
		if (ret)
			goto err;

		if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
				bkey_version_in_future,
				"key version number higher than recorded: %llu > %llu",
				k->k->version.lo,
				atomic64_read(&c->key_version)))
			atomic64_set(&c->key_version, k->k->version.lo);
	}

	ret = commit_do(trans, NULL, NULL, 0,
			bch2_mark_key(trans, btree_id, level, old, *k, flags));
fsck_err:
err:
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
	struct bch_fs *c = trans->c;
	struct btree_node_iter iter;
	struct bkey unpacked;
	struct bkey_s_c k;
	struct bkey_buf prev, cur;
	int ret = 0;

	if (!btree_node_type_needs_gc(btree_node_type(b)))
		return 0;

	bch2_btree_node_iter_init_from_start(&iter, b);
	bch2_bkey_buf_init(&prev);
	bch2_bkey_buf_init(&cur);
	bkey_init(&prev.k->k);

	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
				       &k, initial);
		if (ret)
			break;

		bch2_btree_node_iter_advance(&iter, b);

		if (b->c.level) {
			bch2_bkey_buf_reassemble(&cur, c, k);

			ret = bch2_gc_check_topology(c, b, &prev, cur,
					bch2_btree_node_iter_end(&iter));
			if (ret)
				break;
		}
	}

	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	return ret;
}

static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
			 bool initial, bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct btree *b;
	unsigned depth = metadata_only ? 1 : 0;
	int ret = 0;

	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));

	__for_each_btree_node(trans, iter, btree_id, POS_MIN,
			      0, depth, BTREE_ITER_PREFETCH, b, ret) {
		bch2_verify_btree_nr_keys(b);

		gc_pos_set(c, gc_pos_btree_node(b));

		ret = btree_gc_mark_node(trans, b, initial);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	mutex_lock(&c->btree_root_lock);
	b = bch2_btree_id_root(c, btree_id)->b;
	if (!btree_node_fake(b)) {
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);

		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
				       true, &k, initial);
	}
	gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
	mutex_unlock(&c->btree_root_lock);

	return ret;
}

static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
				      unsigned target_depth)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
	bch2_bkey_buf_init(&prev);
	bch2_bkey_buf_init(&cur);
	bkey_init(&prev.k->k);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
				       false, &k, true);
		if (ret)
			goto fsck_err;

		if (b->c.level) {
			bch2_bkey_buf_reassemble(&cur, c, k);
			k = bkey_i_to_s_c(cur.k);

			bch2_btree_and_journal_iter_advance(&iter);

			ret = bch2_gc_check_topology(c, b,
					&prev, cur,
					!bch2_btree_and_journal_iter_peek(&iter).k);
			if (ret)
				goto fsck_err;
		} else {
			bch2_btree_and_journal_iter_advance(&iter);
		}
	}

	if (b->c.level > target_depth) {
		bch2_btree_and_journal_iter_exit(&iter);
		bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

		while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
			struct btree *child;

			bch2_bkey_buf_reassemble(&cur, c, k);
			bch2_btree_and_journal_iter_advance(&iter);

			child = bch2_btree_node_get_noiter(trans, cur.k,
						b->c.btree_id, b->c.level - 1,
						false);
			ret = PTR_ERR_OR_ZERO(child);

			if (ret == -EIO) {
				bch2_topology_error(c);

				if (__fsck_err(c,
					       FSCK_CAN_FIX|
					       FSCK_CAN_IGNORE|
					       FSCK_NO_RATELIMIT,
					       btree_node_read_error,
					       "Unreadable btree node at btree %s level %u:\n"
					       "  %s",
					       bch2_btree_id_str(b->c.btree_id),
					       b->c.level - 1,
					       (printbuf_reset(&buf),
						bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
				    should_restart_for_topology_repair(c)) {
					bch_info(c, "Halting mark and sweep to start topology repair pass");
					ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
					goto fsck_err;
				} else {
					/*
					 * Continue marking when opted to not
					 * fix the error:
					 */
					ret = 0;
					set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
					continue;
				}
			} else if (ret) {
				bch_err_msg(c, ret, "getting btree node");
				break;
			}

			ret = bch2_gc_btree_init_recurse(trans, child,
							 target_depth);
			six_unlock_read(&child->c.lock);

			if (ret)
				break;
		}
	}
fsck_err:
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	bch2_btree_and_journal_iter_exit(&iter);
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_btree_init(struct btree_trans *trans,
			      enum btree_id btree_id,
			      bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	unsigned target_depth = metadata_only ? 1 : 0;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	b = bch2_btree_id_root(c, btree_id)->b;

	if (btree_node_fake(b))
		return 0;

	six_lock_read(&b->c.lock, NULL, NULL);
	printbuf_reset(&buf);
	bch2_bpos_to_text(&buf, b->data->min_key);
	if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
				btree_root_bad_min_key,
				"btree root with incorrect min_key: %s", buf.buf)) {
		bch_err(c, "repair unimplemented");
		ret = -BCH_ERR_fsck_repair_unimplemented;
		goto fsck_err;
	}

	printbuf_reset(&buf);
	bch2_bpos_to_text(&buf, b->data->max_key);
	if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
				btree_root_bad_max_key,
				"btree root with incorrect max_key: %s", buf.buf)) {
		bch_err(c, "repair unimplemented");
		ret = -BCH_ERR_fsck_repair_unimplemented;
		goto fsck_err;
	}

	if (b->c.level >= target_depth)
		ret = bch2_gc_btree_init_recurse(trans, b, target_depth);

	if (!ret) {
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);

		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
				       &k, true);
	}
fsck_err:
	six_unlock_read(&b->c.lock);

	if (ret < 0)
		bch_err_fn(c, ret);
	printbuf_exit(&buf);
	return ret;
}

static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return (int) btree_id_to_gc_phase(l) -
	       (int) btree_id_to_gc_phase(r);
}

static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
{
	struct btree_trans *trans = bch2_trans_get(c);
	enum btree_id ids[BTREE_ID_NR];
	unsigned i;
	int ret = 0;

	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < BTREE_ID_NR && !ret; i++)
		ret = initial
			? bch2_gc_btree_init(trans, ids[i], metadata_only)
			: bch2_gc_btree(trans, ids[i], initial, metadata_only);

	for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
		if (!bch2_btree_id_root(c, i)->alive)
			continue;

		ret = initial
			? bch2_gc_btree_init(trans, i, metadata_only)
			: bch2_gc_btree(trans, i, initial, metadata_only);
	}

	if (ret < 0)
		bch_err_fn(c, ret);

	bch2_trans_put(trans);
	return ret;
}

static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
				  u64 start, u64 end,
				  enum bch_data_type type,
				  unsigned flags)
{
	u64 b = sector_to_bucket(ca, start);

	do {
		unsigned sectors =
			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;

		bch2_mark_metadata_bucket(c, ca, b, type, sectors,
					  gc_phase(GC_PHASE_SB), flags);
		b++;
		start += sectors;
	} while (start < end);
}

static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
				     unsigned flags)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	unsigned i;
	u64 b;

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);

		if (offset == BCH_SB_SECTOR)
			mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
					      BCH_DATA_sb, flags);

		mark_metadata_sectors(c, ca, offset,
				      offset + (1 << layout->sb_max_size_bits),
				      BCH_DATA_sb, flags);
	}

	for (i = 0; i < ca->journal.nr; i++) {
		b = ca->journal.buckets[i];
		bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
					  ca->mi.bucket_size,
					  gc_phase(GC_PHASE_SB), flags);
	}
}

static void bch2_mark_superblocks(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	mutex_lock(&c->sb_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_SB));

	for_each_online_member(ca, c, i)
		bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
	mutex_unlock(&c->sb_lock);
}

#if 0
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
	struct btree_update *as;
	struct pending_btree_node_free *d;

	mutex_lock(&c->btree_interior_update_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));

	for_each_pending_btree_node_free(c, as, d)
		if (d->index_update_done)
			bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);

	mutex_unlock(&c->btree_interior_update_lock);
}
#endif

static void bch2_gc_free(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	genradix_free(&c->reflink_gc_table);
	genradix_free(&c->gc_stripes);

	for_each_member_device(ca, c, i) {
		kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
			sizeof(struct bucket_array) +
			ca->mi.nbuckets * sizeof(struct bucket));
		ca->buckets_gc = NULL;

		free_percpu(ca->usage_gc);
		ca->usage_gc = NULL;
	}

	free_percpu(c->usage_gc);
	c->usage_gc = NULL;
}

static int bch2_gc_done(struct bch_fs *c,
			bool initial, bool metadata_only)
{
	struct bch_dev *ca = NULL;
	struct printbuf buf = PRINTBUF;
	bool verify = !metadata_only &&
		!c->opts.reconstruct_alloc &&
		(!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
	unsigned i, dev;
	int ret = 0;

	percpu_down_write(&c->mark_lock);

#define copy_field(_err, _f, _msg, ...)					\
	if (dst->_f != src->_f &&					\
	    (!verify ||							\
	     fsck_err(c, _err, _msg ": got %llu, should be %llu"	\
		      , ##__VA_ARGS__, dst->_f, src->_f)))		\
		dst->_f = src->_f
#define copy_dev_field(_err, _f, _msg, ...)				\
	copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
#define copy_fs_field(_err, _f, _msg, ...)				\
	copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)

	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		bch2_fs_usage_acc_to_base(c, i);

	for_each_member_device(ca, c, dev) {
		struct bch_dev_usage *dst = ca->usage_base;
		struct bch_dev_usage *src = (void *)
			bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
					     dev_usage_u64s());

		for (i = 0; i < BCH_DATA_NR; i++) {
			copy_dev_field(dev_usage_buckets_wrong,
				       d[i].buckets, "%s buckets", bch2_data_types[i]);
			copy_dev_field(dev_usage_sectors_wrong,
				       d[i].sectors, "%s sectors", bch2_data_types[i]);
			copy_dev_field(dev_usage_fragmented_wrong,
				       d[i].fragmented, "%s fragmented", bch2_data_types[i]);
		}

		copy_dev_field(dev_usage_buckets_ec_wrong,
			       buckets_ec, "buckets_ec");
	}

	{
		unsigned nr = fs_usage_u64s(c);
		struct bch_fs_usage *dst = c->usage_base;
		struct bch_fs_usage *src = (void *)
			bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);

		copy_fs_field(fs_usage_hidden_wrong,
			      hidden, "hidden");
		copy_fs_field(fs_usage_btree_wrong,
			      btree, "btree");

		if (!metadata_only) {
			copy_fs_field(fs_usage_data_wrong,
				      data, "data");
			copy_fs_field(fs_usage_cached_wrong,
				      cached, "cached");
			copy_fs_field(fs_usage_reserved_wrong,
				      reserved, "reserved");
			copy_fs_field(fs_usage_nr_inodes_wrong,
				      nr_inodes, "nr_inodes");

			for (i = 0; i < BCH_REPLICAS_MAX; i++)
				copy_fs_field(fs_usage_persistent_reserved_wrong,
					      persistent_reserved[i],
					      "persistent_reserved[%i]", i);
		}

		for (i = 0; i < c->replicas.nr; i++) {
			struct bch_replicas_entry *e =
				cpu_replicas_entry(&c->replicas, i);

			if (metadata_only &&
			    (e->data_type == BCH_DATA_user ||
			     e->data_type == BCH_DATA_cached))
				continue;

			printbuf_reset(&buf);
			bch2_replicas_entry_to_text(&buf, e);

			copy_fs_field(fs_usage_replicas_wrong,
				      replicas[i], "%s", buf.buf);
		}
	}

#undef copy_fs_field
#undef copy_dev_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
	if (ca)
		percpu_ref_put(&ca->ref);
	if (ret)
		bch_err_fn(c, ret);

	percpu_up_write(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_start(struct bch_fs *c)
{
	struct bch_dev *ca = NULL;
	unsigned i;

	BUG_ON(c->usage_gc);

	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
					 sizeof(u64), GFP_KERNEL);
	if (!c->usage_gc) {
		bch_err(c, "error allocating c->usage_gc");
		return -BCH_ERR_ENOMEM_gc_start;
	}

	for_each_member_device(ca, c, i) {
		BUG_ON(ca->usage_gc);

		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage_gc) {
			bch_err(c, "error allocating ca->usage_gc");
			percpu_ref_put(&ca->ref);
			return -BCH_ERR_ENOMEM_gc_start;
		}

		this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
			       ca->mi.nbuckets - ca->mi.first_bucket);
	}

	return 0;
}

static int bch2_gc_reset(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	for_each_member_device(ca, c, i) {
		free_percpu(ca->usage_gc);
		ca->usage_gc = NULL;
	}

	free_percpu(c->usage_gc);
	c->usage_gc = NULL;

	return bch2_gc_start(c);
}

/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
				     struct bch_alloc_v4 r)
{
	return l.gen != r.gen ||
		l.oldest_gen != r.oldest_gen ||
		l.data_type != r.data_type ||
		l.dirty_sectors != r.dirty_sectors ||
		l.cached_sectors != r.cached_sectors ||
		l.stripe_redundancy != r.stripe_redundancy ||
		l.stripe != r.stripe;
}

static int bch2_alloc_write_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k,
				bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
	struct bucket gc, *b;
	struct bkey_i_alloc_v4 *a;
	struct bch_alloc_v4 old_convert, new;
	const struct bch_alloc_v4 *old;
	enum bch_data_type type;
	int ret;

	if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
		return 1;

	old = bch2_alloc_to_v4(k, &old_convert);
	new = *old;

	percpu_down_read(&c->mark_lock);
	b = gc_bucket(ca, iter->pos.offset);

	/*
	 * b->data_type doesn't yet include need_discard & need_gc_gen states -
	 * fix that here:
	 */
	type = __alloc_data_type(b->dirty_sectors,
				 b->cached_sectors,
				 b->stripe,
				 *old,
				 b->data_type);
	if (b->data_type != type) {
		struct bch_dev_usage *u;

		preempt_disable();
		u = this_cpu_ptr(ca->usage_gc);
		u->d[b->data_type].buckets--;
		b->data_type = type;
		u->d[b->data_type].buckets++;
		preempt_enable();
	}

	gc = *b;
	percpu_up_read(&c->mark_lock);

	if (metadata_only &&
	    gc.data_type != BCH_DATA_sb &&
	    gc.data_type != BCH_DATA_journal &&
	    gc.data_type != BCH_DATA_btree)
		return 0;

	if (gen_after(old->gen, gc.gen))
		return 0;

	if (c->opts.reconstruct_alloc ||
	    fsck_err_on(new.data_type != gc.data_type, c,
			alloc_key_data_type_wrong,
			"bucket %llu:%llu gen %u has wrong data_type"
			": got %s, should be %s",
			iter->pos.inode, iter->pos.offset,
			gc.gen,
			bch2_data_types[new.data_type],
			bch2_data_types[gc.data_type]))
		new.data_type = gc.data_type;

#define copy_bucket_field(_errtype, _f)					\
	if (c->opts.reconstruct_alloc ||				\
	    fsck_err_on(new._f != gc._f, c, _errtype,			\
			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
			": got %u, should be %u",			\
			iter->pos.inode, iter->pos.offset,		\
			gc.gen,						\
			bch2_data_types[gc.data_type],			\
			new._f, gc._f))					\
		new._f = gc._f

	copy_bucket_field(alloc_key_gen_wrong,
			  gen);
	copy_bucket_field(alloc_key_dirty_sectors_wrong,
			  dirty_sectors);
	copy_bucket_field(alloc_key_cached_sectors_wrong,
			  cached_sectors);
	copy_bucket_field(alloc_key_stripe_wrong,
			  stripe);
	copy_bucket_field(alloc_key_stripe_redundancy_wrong,
			  stripe_redundancy);
#undef copy_bucket_field

	if (!bch2_alloc_v4_cmp(*old, new))
		return 0;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	a->v = new;

	/*
	 * The trigger normally makes sure this is set, but we're not running
	 * triggers:
	 */
	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
fsck_err:
	return ret;
}

static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_dev *ca;
	unsigned i;
	int ret = 0;

	for_each_member_device(ca, c, i) {
		ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS(ca->dev_idx, ca->mi.first_bucket),
				BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
				NULL, NULL, BTREE_INSERT_LAZY_RW,
			bch2_alloc_write_key(trans, &iter, k, metadata_only));

		if (ret < 0) {
			bch_err_fn(c, ret);
			percpu_ref_put(&ca->ref);
			break;
		}
	}

	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}

static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
	struct bch_dev *ca;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bucket *g;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	unsigned i;
	int ret;

	for_each_member_device(ca, c, i) {
		struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
				ca->mi.nbuckets * sizeof(struct bucket),
				GFP_KERNEL|__GFP_ZERO);
		if (!buckets) {
			percpu_ref_put(&ca->ref);
			bch_err(c, "error allocating ca->buckets[gc]");
			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
			goto err;
		}

		buckets->first_bucket = ca->mi.first_bucket;
		buckets->nbuckets = ca->mi.nbuckets;
		rcu_assign_pointer(ca->buckets_gc, buckets);
	}

	ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN,
				  BTREE_ITER_PREFETCH, k, ({
		ca = bch_dev_bkey_exists(c, k.k->p.inode);
		g = gc_bucket(ca, k.k->p.offset);

		a = bch2_alloc_to_v4(k, &a_convert);

		g->gen_valid = 1;
		g->gen = a->gen;

		if (metadata_only &&
		    (a->data_type == BCH_DATA_user ||
		     a->data_type == BCH_DATA_cached ||
		     a->data_type == BCH_DATA_parity)) {
			g->data_type = a->data_type;
			g->dirty_sectors = a->dirty_sectors;
			g->cached_sectors = a->cached_sectors;
			g->stripe = a->stripe;
			g->stripe_redundancy = a->stripe_redundancy;
		}

		0;
	}));
err:
	bch2_trans_put(trans);
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
{
	struct bch_dev *ca;
	unsigned i;

	for_each_member_device(ca, c, i) {
		struct bucket_array *buckets = gc_bucket_array(ca);
		struct bucket *g;

		for_each_bucket(g, buckets) {
			if (metadata_only &&
			    (g->data_type == BCH_DATA_user ||
			     g->data_type == BCH_DATA_cached ||
			     g->data_type == BCH_DATA_parity))
				continue;
			g->data_type = 0;
			g->dirty_sectors = 0;
			g->cached_sectors = 0;
		}
	}
}

static int bch2_gc_write_reflink_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k,
				     size_t *idx)
{
	struct bch_fs *c = trans->c;
	const __le64 *refcount = bkey_refcount_c(k);
	struct printbuf buf = PRINTBUF;
	struct reflink_gc *r;
	int ret = 0;

	if (!refcount)
		return 0;

	while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
	       r->offset < k.k->p.offset)
		++*idx;

	if (!r ||
	    r->offset != k.k->p.offset ||
	    r->size != k.k->size) {
		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
		return -EINVAL;
	}

	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
			reflink_v_refcount_wrong,
			"reflink key has wrong refcount:\n"
			"  %s\n"
			"  should be %u",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
			r->refcount)) {
		struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);

		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		if (!r->refcount)
			new->k.type = KEY_TYPE_deleted;
		else
			*bkey_refcount(new) = cpu_to_le64(r->refcount);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	size_t idx = 0;
	int ret = 0;

	if (metadata_only)
		return 0;

	trans = bch2_trans_get(c);

	ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_reflink, POS_MIN,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_NOFAIL,
		bch2_gc_write_reflink_key(trans, &iter, k, &idx));

	c->reflink_gc_nr = 0;
	bch2_trans_put(trans);
	return ret;
}

static int bch2_gc_reflink_start(struct bch_fs *c,
				 bool metadata_only)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct reflink_gc *r;
	int ret = 0;

	if (metadata_only)
		return 0;

	trans = bch2_trans_get(c);
	c->reflink_gc_nr = 0;

	for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		const __le64 *refcount = bkey_refcount_c(k);

		if (!refcount)
			continue;

		r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
				       GFP_KERNEL);
		if (!r) {
			ret = -BCH_ERR_ENOMEM_gc_reflink_start;
			break;
		}

		r->offset = k.k->p.offset;
		r->size = k.k->size;
		r->refcount = 0;
	}
	bch2_trans_iter_exit(trans, &iter);

	bch2_trans_put(trans);
	return ret;
}

static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
{
	struct genradix_iter iter;
	struct reflink_gc *r;

	genradix_for_each(&c->reflink_gc_table, iter, r)
		r->refcount = 0;
}

static int bch2_gc_write_stripes_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	const struct bch_stripe *s;
	struct gc_stripe *m;
	bool bad = false;
	unsigned i;
	int ret = 0;

	if (k.k->type != KEY_TYPE_stripe)
		return 0;

	s = bkey_s_c_to_stripe(k).v;
	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);

	for (i = 0; i < s->nr_blocks; i++) {
		u32 old = stripe_blockcount_get(s, i);
		u32 new = (m ? m->block_sectors[i] : 0);

		if (old != new) {
			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
				   i, old, new);
			bad = true;
		}
	}

	if (bad)
		bch2_bkey_val_to_text(&buf, c, k);

	if (fsck_err_on(bad, c, stripe_sector_count_wrong,
			"%s", buf.buf)) {
		struct bkey_i_stripe *new;

		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		bkey_reassemble(&new->k_i, k);

		for (i = 0; i < new->v.nr_blocks; i++)
			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);

		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	if (metadata_only)
		return 0;

	trans = bch2_trans_get(c);

	ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_stripes, POS_MIN,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_NOFAIL,
		bch2_gc_write_stripes_key(trans, &iter, k));

	bch2_trans_put(trans);
	return ret;
}

static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
{
	genradix_free(&c->gc_stripes);
}

/**
 * bch2_gc - walk _all_ references to buckets, and recompute them:
 *
 * @c:			filesystem object
 * @initial:		are we in recovery?
 * @metadata_only:	are we just checking metadata references, or everything?
 *
 * Returns: 0 on success, or standard errcode on failure
 *
 * Order matters here:
 *  - Concurrent GC relies on the fact that we have a total ordering for
 *    everything that GC walks - see gc_will_visit_node(),
 *    gc_will_visit_root()
 *
 *  - also, references move around in the course of index updates and
 *    various other crap: everything needs to agree on the ordering
 *    references are allowed to move around in - e.g., we're allowed to
 *    start with a reference owned by an open_bucket (the allocator) and
 *    move it to the btree, but not the reverse.
 *
 *    This is necessary to ensure that gc doesn't miss references that
 *    move around - if references move backwards in the ordering GC
 *    uses, GC could skip past them
 */
int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
	unsigned iter = 0;
	int ret;

	lockdep_assert_held(&c->state_lock);

	down_write(&c->gc_lock);

	bch2_btree_interior_updates_flush(c);

	ret = bch2_gc_start(c) ?:
		bch2_gc_alloc_start(c, metadata_only) ?:
		bch2_gc_reflink_start(c, metadata_only);
	if (ret)
		goto out;
again:
	gc_pos_set(c, gc_phase(GC_PHASE_START));

	bch2_mark_superblocks(c);

	ret = bch2_gc_btrees(c, initial, metadata_only);

	if (ret)
		goto out;

#if 0
	bch2_mark_pending_btree_node_frees(c);
#endif
	c->gc_count++;

	if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
	    (!iter && bch2_test_restart_gc)) {
		if (iter++ > 2) {
			bch_info(c, "Unable to fix bucket gens, looping");
			ret = -EINVAL;
			goto out;
		}

		/*
		 * XXX: make sure gens we fixed got saved
		 */
		bch_info(c, "Second GC pass needed, restarting:");
		clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
		__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

		bch2_gc_stripes_reset(c, metadata_only);
		bch2_gc_alloc_reset(c, metadata_only);
		bch2_gc_reflink_reset(c, metadata_only);
		ret = bch2_gc_reset(c);
		if (ret)
			goto out;

		/* flush fsck errors, reset counters */
		bch2_flush_fsck_errs(c);
		goto again;
	}
out:
	if (!ret) {
		bch2_journal_block(&c->journal);

		ret = bch2_gc_stripes_done(c, metadata_only) ?:
			bch2_gc_reflink_done(c, metadata_only) ?:
			bch2_gc_alloc_done(c, metadata_only) ?:
			bch2_gc_done(c, initial, metadata_only);

		bch2_journal_unblock(&c->journal);
	}

	percpu_down_write(&c->mark_lock);
	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

	bch2_gc_free(c);
	percpu_up_write(&c->mark_lock);

	up_write(&c->gc_lock);

	/*
	 * At startup, allocations can happen directly instead of via the
	 * allocator thread - issue wakeup in case they blocked on gc_lock:
	 */
	closure_wake_up(&c->freelist_wait);

	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

static int gc_btree_gens_key(struct btree_trans *trans,
			     struct btree_iter *iter,
			     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	struct bkey_i *u;
	int ret;

	percpu_down_read(&c->mark_lock);
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

		if (ptr_stale(ca, ptr) > 16) {
			percpu_up_read(&c->mark_lock);
			goto update;
		}
	}

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];

		if (gen_after(*gen, ptr->gen))
			*gen = ptr->gen;
	}
	percpu_up_read(&c->mark_lock);
	return 0;
update:
	u = bch2_bkey_make_mut(trans, iter, &k, 0);
	ret = PTR_ERR_OR_ZERO(u);
	if (ret)
		return ret;

	bch2_extent_normalize(c, bkey_i_to_s(u));
	return 0;
}

static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
				       struct bkey_s_c k)
{
	struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
	struct bkey_i_alloc_v4 *a_mut;
	int ret;

	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
		return 0;

	a_mut = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a_mut);
	if (ret)
		return ret;

	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
	a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);

	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}

int bch2_gc_gens(struct bch_fs *c)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_dev *ca;
	u64 b, start_time = local_clock();
	unsigned i;
	int ret;

	/*
	 * Ideally we would be using state_lock and not gc_lock here, but that
	 * introduces a deadlock in the RO path - we currently take the state
	 * lock at the start of going RO, thus the gc thread may get stuck:
	 */
	if (!mutex_trylock(&c->gc_gens_lock))
		return 0;

	trace_and_count(c, gc_gens_start, c);
	down_read(&c->gc_lock);
	trans = bch2_trans_get(c);

	for_each_member_device(ca, c, i) {
		struct bucket_gens *gens = bucket_gens(ca);

		BUG_ON(ca->oldest_gen);

		ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
		if (!ca->oldest_gen) {
			percpu_ref_put(&ca->ref);
			ret = -BCH_ERR_ENOMEM_gc_gens;
			goto err;
		}

		for (b = gens->first_bucket;
		     b < gens->nbuckets; b++)
			ca->oldest_gen[b] = gens->b[b];
	}

	for (i = 0; i < BTREE_ID_NR; i++)
		if (btree_type_has_ptrs(i)) {
			c->gc_gens_btree = i;
			c->gc_gens_pos = POS_MIN;

			ret = for_each_btree_key_commit(trans, iter, i,
					POS_MIN,
					BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
					k,
					NULL, NULL,
					BTREE_INSERT_NOFAIL,
				gc_btree_gens_key(trans, &iter, k));
			if (ret && !bch2_err_matches(ret, EROFS))
				bch_err_fn(c, ret);
			if (ret)
				goto err;
		}

	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
			POS_MIN,
			BTREE_ITER_PREFETCH,
			k,
			NULL, NULL,
			BTREE_INSERT_NOFAIL,
		bch2_alloc_write_oldest_gen(trans, &iter, k));
	if (ret && !bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	if (ret)
		goto err;

	c->gc_gens_btree = 0;
	c->gc_gens_pos = POS_MIN;

	c->gc_count++;

	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
	trace_and_count(c, gc_gens_end, c);
err:
	for_each_member_device(ca, c, i) {
		kvfree(ca->oldest_gen);
		ca->oldest_gen = NULL;
	}

	bch2_trans_put(trans);
	up_read(&c->gc_lock);
	mutex_unlock(&c->gc_gens_lock);
	return ret;
}

static int bch2_gc_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct io_clock *clock = &c->io_clock[WRITE];
	unsigned long last = atomic64_read(&clock->now);
	unsigned last_kick = atomic_read(&c->kick_gc);
	int ret;

	set_freezable();

	while (1) {
		while (1) {
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				return 0;
			}

			if (atomic_read(&c->kick_gc) != last_kick)
				break;

			if (c->btree_gc_periodic) {
				unsigned long next = last + c->capacity / 16;

				if (atomic64_read(&clock->now) >= next)
					break;

				bch2_io_clock_schedule_timeout(clock, next);
			} else {
				schedule();
			}

			try_to_freeze();
		}
		__set_current_state(TASK_RUNNING);

		last = atomic64_read(&clock->now);
		last_kick = atomic_read(&c->kick_gc);

		/*
		 * Full gc is currently incompatible with btree key cache:
		 */
#if 0
		ret = bch2_gc(c, false, false);
#else
		ret = bch2_gc_gens(c);
#endif
		if (ret < 0)
			bch_err_fn(c, ret);

		debug_check_no_locks_held();
	}

	return 0;
}

void bch2_gc_thread_stop(struct bch_fs *c)
{
	struct task_struct *p;

	p = c->gc_thread;
	c->gc_thread = NULL;

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_gc_thread_start(struct bch_fs *c)
{
	struct task_struct *p;

	if (c->gc_thread)
		return 0;

	p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
	if (IS_ERR(p)) {
		bch_err_fn(c, PTR_ERR(p));
		return PTR_ERR(p);
	}

	get_task_struct(p);
	c->gc_thread = p;
	wake_up_process(p);
	return 0;
}