// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>

#define DROP_THIS_NODE		10
#define DROP_PREV_NODE		11

static bool should_restart_for_topology_repair(struct bch_fs *c)
{
	return c->opts.fix_errors != FSCK_FIX_no &&
		!(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
}

static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	preempt_disable();
	write_seqcount_begin(&c->gc_pos_lock);
	c->gc_pos = new_pos;
	write_seqcount_end(&c->gc_pos_lock);
	preempt_enable();
}

static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
	__gc_pos_set(c, new_pos);
}
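/*
 * Concurrent mark and sweep relies on gc_pos only ever moving forward (hence
 * the BUG_ON above).  Code that updates bucket marks while GC is running
 * compares its own position against c->gc_pos to decide whether GC has already
 * visited a reference.  A minimal sketch of such a check, using a hypothetical
 * helper name (the real helper lives in the buckets code):
 *
 *	static bool gc_visited(struct bch_fs *c, struct gc_pos pos)
 *	{
 *		unsigned seq;
 *		bool ret;
 *
 *		do {
 *			seq = read_seqcount_begin(&c->gc_pos_lock);
 *			ret = gc_pos_cmp(pos, c->gc_pos) < 0;
 *		} while (read_seqcount_retry(&c->gc_pos_lock, seq));
 *
 *		return ret;
 *	}
 */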
/*
 * Missing: if an interior btree node is empty, we need to do something -
 * perhaps just kill it
 */
static int bch2_gc_check_topology(struct bch_fs *c,
				  struct btree *b,
				  struct bkey_buf *prev,
				  struct bkey_buf cur,
				  bool is_last)
{
	struct bpos node_start	= b->data->min_key;
	struct bpos node_end	= b->data->max_key;
	struct bpos expected_start = bkey_deleted(&prev->k->k)
		? node_start
		: bpos_successor(prev->k->k.p);
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
	int ret = 0;

	if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);

		if (!bpos_eq(expected_start, bp->v.min_key)) {
			bch2_topology_error(c);

			if (bkey_deleted(&prev->k->k)) {
				prt_printf(&buf1, "start of node: ");
				bch2_bpos_to_text(&buf1, node_start);
			} else {
				bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
			}
			bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));

			if (__fsck_err(c,
				       FSCK_CAN_FIX|
				       FSCK_CAN_IGNORE|
				       FSCK_NO_RATELIMIT,
				       "btree node with incorrect min_key at btree %s level %u:\n"
				       "  prev %s\n"
				       "  cur %s",
				       bch2_btree_ids[b->c.btree_id], b->c.level,
				       buf1.buf, buf2.buf) &&
			    should_restart_for_topology_repair(c)) {
				bch_info(c, "Halting mark and sweep to start topology repair pass");
				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
				goto err;
			} else {
				set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
			}
		}
	}

	if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
		bch2_topology_error(c);

		printbuf_reset(&buf1);
		printbuf_reset(&buf2);

		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
		bch2_bpos_to_text(&buf2, node_end);

		if (__fsck_err(c,
			       FSCK_CAN_FIX|
			       FSCK_CAN_IGNORE|
			       FSCK_NO_RATELIMIT,
			       "btree node with incorrect max_key at btree %s level %u:\n"
			       "  %s\n"
			       "  expected %s",
			       bch2_btree_ids[b->c.btree_id], b->c.level,
			       buf1.buf, buf2.buf) &&
		    should_restart_for_topology_repair(c)) {
			bch_info(c, "Halting mark and sweep to start topology repair pass");
			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
			goto err;
		} else {
			set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
		}
	}

	bch2_bkey_buf_copy(prev, c, cur.k);
err:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
	switch (b->key.k.type) {
	case KEY_TYPE_btree_ptr: {
		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);

		dst->k.p		= src->k.p;
		dst->v.mem_ptr		= 0;
		dst->v.seq		= b->data->keys.seq;
		dst->v.sectors_written	= 0;
		dst->v.flags		= 0;
		dst->v.min_key		= b->data->min_key;
		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
		break;
	}
	case KEY_TYPE_btree_ptr_v2:
		bkey_copy(&dst->k_i, &b->key);
		break;
	default:
		BUG();
	}
}

static void bch2_btree_node_update_key_early(struct btree_trans *trans,
					     enum btree_id btree, unsigned level,
					     struct bkey_s_c old, struct bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->min_key	= new_min;
	new->v.min_key		= new_min;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);
	bkey_copy(&b->key, &new->k_i);
	return 0;
}

static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
	if (ret)
		return ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->max_key	= new_max;
	new->k.p		= new_max;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);

	mutex_lock(&c->btree_cache.lock);
	bch2_btree_node_hash_remove(&c->btree_cache, b);

	bkey_copy(&b->key, &new->k_i);
	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
	BUG_ON(ret);
	mutex_unlock(&c->btree_cache.lock);
	return 0;
}
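/*
 * Both helpers above repair a node's range by rewriting its parent pointer as
 * a btree_ptr_v2 with BTREE_PTR_RANGE_UPDATED set, pushed into the journal
 * keys layer via bch2_journal_key_insert_take() so the fix is seen before
 * journal replay; keys now outside the corrected range are dropped from the
 * in-memory node.
 */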
static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
					struct btree *prev, struct btree *cur)
{
	struct bpos expected_start = !prev
		? b->data->min_key
		: bpos_successor(prev->key.k.p);
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
	int ret = 0;

	if (!prev) {
		prt_printf(&buf1, "start of node: ");
		bch2_bpos_to_text(&buf1, b->data->min_key);
	} else {
		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
	}

	bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));

	if (prev &&
	    bpos_gt(expected_start, cur->data->min_key) &&
	    BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
		/* cur overwrites prev: */

		if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
						cur->data->min_key), c,
				"btree node overwritten by next node at btree %s level %u:\n"
				"  node %s\n"
				"  next %s",
				bch2_btree_ids[b->c.btree_id], b->c.level,
				buf1.buf, buf2.buf)) {
			ret = DROP_PREV_NODE;
			goto out;
		}

		if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
						 bpos_predecessor(cur->data->min_key)), c,
				"btree node with incorrect max_key at btree %s level %u:\n"
				"  node %s\n"
				"  next %s",
				bch2_btree_ids[b->c.btree_id], b->c.level,
				buf1.buf, buf2.buf))
			ret = set_node_max(c, prev,
					   bpos_predecessor(cur->data->min_key));
	} else {
		/* prev overwrites cur: */

		if (mustfix_fsck_err_on(bpos_ge(expected_start,
						cur->data->max_key), c,
				"btree node overwritten by prev node at btree %s level %u:\n"
				"  prev %s\n"
				"  node %s",
				bch2_btree_ids[b->c.btree_id], b->c.level,
				buf1.buf, buf2.buf)) {
			ret = DROP_THIS_NODE;
			goto out;
		}

		if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
				"btree node with incorrect min_key at btree %s level %u:\n"
				"  prev %s\n"
				"  node %s",
				bch2_btree_ids[b->c.btree_id], b->c.level,
				buf1.buf, buf2.buf))
			ret = set_node_min(c, cur, expected_start);
	}
out:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
				 struct btree *child)
{
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
	int ret = 0;

	bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
	bch2_bpos_to_text(&buf2, b->key.k.p);

	if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
			"btree node with incorrect max_key at btree %s level %u:\n"
			"  %s\n"
			"  expected %s",
			bch2_btree_ids[b->c.btree_id], b->c.level,
			buf1.buf, buf2.buf)) {
		ret = set_node_max(c, child, b->key.k.p);
		if (ret)
			goto err;
	}
err:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}
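/*
 * btree_repair_node_boundaries() and the recursion below communicate via the
 * positive DROP_THIS_NODE/DROP_PREV_NODE sentinels defined at the top of this
 * file: the caller translates them into evicting the offending node from the
 * btree cache and deleting its parent pointer from the journal keys.
 */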
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf prev_k, cur_k;
	struct btree *prev = NULL, *cur = NULL;
	bool have_child, dropped_children = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!b->c.level)
		return 0;
again:
	prev = NULL;
	have_child = dropped_children = false;
	bch2_bkey_buf_init(&prev_k);
	bch2_bkey_buf_init(&cur_k);
	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		bch2_btree_and_journal_iter_advance(&iter);
		bch2_bkey_buf_reassemble(&cur_k, c, k);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
					b->c.btree_id, b->c.level - 1,
					false);
		ret = PTR_ERR_OR_ZERO(cur);

		printbuf_reset(&buf);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));

		if (mustfix_fsck_err_on(ret == -EIO, c,
				"Topology repair: unreadable btree node at btree %s level %u:\n"
				"  %s",
				bch2_btree_ids[b->c.btree_id],
				b->c.level - 1,
				buf.buf)) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		if (ret) {
			bch_err_msg(c, ret, "getting btree node");
			break;
		}

		ret = btree_repair_node_boundaries(c, b, prev, cur);

		if (ret == DROP_THIS_NODE) {
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		if (prev)
			six_unlock_read(&prev->c.lock);
		prev = NULL;

		if (ret == DROP_PREV_NODE) {
			bch2_btree_node_evict(trans, prev_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, prev_k.k->k.p);
			if (ret)
				break;

			bch2_btree_and_journal_iter_exit(&iter);
			bch2_bkey_buf_exit(&prev_k, c);
			bch2_bkey_buf_exit(&cur_k, c);
			goto again;
		} else if (ret)
			break;

		prev = cur;
		cur = NULL;
		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
	}

	if (!ret && !IS_ERR_OR_NULL(prev)) {
		BUG_ON(cur);
		ret = btree_repair_node_end(c, b, prev);
	}

	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	prev = NULL;
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);
	cur = NULL;

	if (ret)
		goto err;

	bch2_btree_and_journal_iter_exit(&iter);
	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		bch2_bkey_buf_reassemble(&cur_k, c, k);
		bch2_btree_and_journal_iter_advance(&iter);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
					b->c.btree_id, b->c.level - 1,
					false);
		ret = PTR_ERR_OR_ZERO(cur);

		if (ret) {
			bch_err_msg(c, ret, "getting btree node");
			goto err;
		}

		ret = bch2_btree_repair_topology_recurse(trans, cur);
		six_unlock_read(&cur->c.lock);
		cur = NULL;

		if (ret == DROP_THIS_NODE) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			dropped_children = true;
		}

		if (ret)
			goto err;

		have_child = true;
	}

	printbuf_reset(&buf);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (mustfix_fsck_err_on(!have_child, c,
			"empty interior btree node at btree %s level %u\n"
			"  %s",
			bch2_btree_ids[b->c.btree_id],
			b->c.level, buf.buf))
		ret = DROP_THIS_NODE;
err:
fsck_err:
	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);

	bch2_btree_and_journal_iter_exit(&iter);
	bch2_bkey_buf_exit(&prev_k, c);
	bch2_bkey_buf_exit(&cur_k, c);

	if (!ret && dropped_children)
		goto again;

	printbuf_exit(&buf);
	return ret;
}
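/*
 * The recursion above makes two passes over each interior node: the first
 * fixes boundaries between adjacent children (and drops unreadable ones), the
 * second recurses into each child.  If any child was dropped we restart so the
 * boundary checks see the final set of children, and an interior node left
 * with no children at all is reported to the caller as DROP_THIS_NODE.
 */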
int bch2_check_topology(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree *b;
	unsigned i;
	int ret = 0;

	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (!r->alive)
			continue;

		b = r->b;
		if (btree_node_fake(b))
			continue;

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		ret = bch2_btree_repair_topology_recurse(trans, b);
		six_unlock_read(&b->c.lock);

		if (ret == DROP_THIS_NODE) {
			bch_err(c, "empty btree root - repair unimplemented");
			ret = -BCH_ERR_fsck_repair_unimplemented;
		}
	}

	bch2_trans_put(trans);

	return ret;
}

static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
			       unsigned level, bool is_root,
			       struct bkey_s_c *k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
	const union bch_extent_entry *entry_c;
	struct extent_ptr_decoded p = { 0 };
	bool do_update = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	/*
	 * XXX
	 * use check_bucket_ref here
	 */
	bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);

		if (!g->gen_valid &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
			      p.ptr.gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
			if (!p.ptr.cached) {
				g->gen_valid		= true;
				g->gen			= p.ptr.gen;
			} else {
				do_update = true;
			}
		}

		if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
			      p.ptr.gen, g->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
			if (!p.ptr.cached) {
				g->gen_valid		= true;
				g->gen			= p.ptr.gen;
				g->data_type		= 0;
				g->dirty_sectors	= 0;
				g->cached_sectors	= 0;
				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
			} else {
				do_update = true;
			}
		}

		if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
			      p.ptr.gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
			do_update = true;

		if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
		    (c->opts.reconstruct_alloc ||
		     fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
			      "while marking %s",
			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
			      p.ptr.gen, g->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
			do_update = true;

		if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
			continue;
		if (fsck_err_on(bucket_data_type(g->data_type) &&
				bucket_data_type(g->data_type) != data_type, c,
				"bucket %u:%zu different types of data in same bucket: %s, %s\n"
				"while marking %s",
				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
				bch2_data_types[g->data_type],
				bch2_data_types[data_type],
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
			if (data_type == BCH_DATA_btree) {
				g->data_type	= data_type;
				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
			} else {
				do_update = true;
			}
		}

		if (p.has_ec) {
			struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);

			if (fsck_err_on(!m || !m->alive, c,
					"pointer to nonexistent stripe %llu\n"
					"while marking %s",
					(u64) p.ec.idx,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
				do_update = true;

			if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
					"pointer does not match stripe %llu\n"
					"while marking %s",
					(u64) p.ec.idx,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
				do_update = true;
		}
	}

	if (do_update) {
		struct bkey_ptrs ptrs;
		union bch_extent_entry *entry;
		struct bch_extent_ptr *ptr;
		struct bkey_i *new;

		if (is_root) {
			bch_err(c, "cannot update btree roots yet");
			ret = -EINVAL;
			goto err;
		}

		new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
		if (!new) {
			/* set the error code before logging it: */
			ret = -BCH_ERR_ENOMEM_gc_repair_key;
			bch_err_msg(c, ret, "allocating new key");
			goto err;
		}

		bkey_reassemble(new, *k);

		if (level) {
			/*
			 * We don't want to drop btree node pointers - if the
			 * btree node isn't there anymore, the read path will
			 * sort it out:
			 */
			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
			bkey_for_each_ptr(ptrs, ptr) {
				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
				struct bucket *g = PTR_GC_BUCKET(ca, ptr);

				ptr->gen = g->gen;
			}
		} else {
			bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
				enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);

				(ptr->cached &&
				 (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
				(!ptr->cached &&
				 gen_cmp(ptr->gen, g->gen) < 0) ||
				gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
				(g->data_type &&
				 g->data_type != data_type);
			}));
again:
			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
			bkey_extent_entry_for_each(ptrs, entry) {
				if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
					struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
									entry->stripe_ptr.idx);
					union bch_extent_entry *next_ptr;

					bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
						if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
							goto found;
					next_ptr = NULL;
found:
					if (!next_ptr) {
						bch_err(c, "aieee, found stripe ptr with no data ptr");
						continue;
					}

					if (!m || !m->alive ||
					    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
								       &next_ptr->ptr,
								       m->sectors)) {
						bch2_bkey_extent_entry_drop(new, entry);
						goto again;
					}
				}
			}
		}

		ret = bch2_journal_key_insert_take(c, btree_id, level, new);
		if (ret) {
			kfree(new);
			goto err;
		}

		if (level)
			bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);

		if (0) {
			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, *k);
			bch_info(c, "updated %s", buf.buf);

			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
			bch_info(c, "new key %s", buf.buf);
		}

		*k = bkey_i_to_s_c(new);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
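/*
 * bch2_check_fix_ptrs() cross-checks every pointer in a key against the
 * in-memory GC bucket state: inconsistencies in dirty pointers are repaired by
 * fixing the bucket itself, while stale or otherwise bad pointers cause the
 * key to be rewritten (with offending pointers dropped) through the journal
 * keys layer, so the fix is visible to the rest of recovery.
 */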
/* marking of btree keys/nodes: */

static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
			    unsigned level, bool is_root,
			    struct bkey_s_c *k,
			    bool initial)
{
	struct bch_fs *c = trans->c;
	struct bkey deleted = KEY(0, 0, 0);
	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
	unsigned flags =
		BTREE_TRIGGER_GC|
		(initial ? BTREE_TRIGGER_NOATOMIC : 0);
	int ret = 0;

	deleted.p = k->k->p;

	if (initial) {
		BUG_ON(bch2_journal_seq_verify &&
		       k->k->version.lo > atomic64_read(&c->journal.seq));

		ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
		if (ret)
			goto err;

		if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
				"key version number higher than recorded: %llu > %llu",
				k->k->version.lo,
				atomic64_read(&c->key_version)))
			atomic64_set(&c->key_version, k->k->version.lo);
	}

	ret = commit_do(trans, NULL, NULL, 0,
			bch2_mark_key(trans, btree_id, level, old, *k, flags));
fsck_err:
err:
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
	struct bch_fs *c = trans->c;
	struct btree_node_iter iter;
	struct bkey unpacked;
	struct bkey_s_c k;
	struct bkey_buf prev, cur;
	int ret = 0;

	if (!btree_node_type_needs_gc(btree_node_type(b)))
		return 0;

	bch2_btree_node_iter_init_from_start(&iter, b);
	bch2_bkey_buf_init(&prev);
	bch2_bkey_buf_init(&cur);
	bkey_init(&prev.k->k);

	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
				       &k, initial);
		if (ret)
			break;

		bch2_btree_node_iter_advance(&iter, b);

		if (b->c.level) {
			bch2_bkey_buf_reassemble(&cur, c, k);

			ret = bch2_gc_check_topology(c, b, &prev, cur,
						     bch2_btree_node_iter_end(&iter));
			if (ret)
				break;
		}
	}

	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	return ret;
}
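/*
 * Marking in this file runs the usual key triggers, but with BTREE_TRIGGER_GC
 * set so they accumulate into the gc copies of the usage counters
 * (c->usage_gc, ca->usage_gc, the gc buckets) rather than the live ones;
 * BTREE_TRIGGER_NOATOMIC is added for the initial (recovery-time) pass.
 */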
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
			 bool initial, bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct btree *b;
	unsigned depth = metadata_only ? 1 : 0;
	int ret = 0;

	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));

	__for_each_btree_node(trans, iter, btree_id, POS_MIN,
			      0, depth, BTREE_ITER_PREFETCH, b, ret) {
		bch2_verify_btree_nr_keys(b);

		gc_pos_set(c, gc_pos_btree_node(b));

		ret = btree_gc_mark_node(trans, b, initial);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	mutex_lock(&c->btree_root_lock);
	b = bch2_btree_id_root(c, btree_id)->b;
	if (!btree_node_fake(b)) {
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);

		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
				       true, &k, initial);
	}
	gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
	mutex_unlock(&c->btree_root_lock);

	return ret;
}

static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
				      unsigned target_depth)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
	bch2_bkey_buf_init(&prev);
	bch2_bkey_buf_init(&cur);
	bkey_init(&prev.k->k);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
				       false, &k, true);
		if (ret)
			goto fsck_err;

		if (b->c.level) {
			bch2_bkey_buf_reassemble(&cur, c, k);
			k = bkey_i_to_s_c(cur.k);

			bch2_btree_and_journal_iter_advance(&iter);

			ret = bch2_gc_check_topology(c, b,
					&prev, cur,
					!bch2_btree_and_journal_iter_peek(&iter).k);
			if (ret)
				goto fsck_err;
		} else {
			bch2_btree_and_journal_iter_advance(&iter);
		}
	}

	if (b->c.level > target_depth) {
		bch2_btree_and_journal_iter_exit(&iter);
		bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

		while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
			struct btree *child;

			bch2_bkey_buf_reassemble(&cur, c, k);
			bch2_btree_and_journal_iter_advance(&iter);

			child = bch2_btree_node_get_noiter(trans, cur.k,
						b->c.btree_id, b->c.level - 1,
						false);
			ret = PTR_ERR_OR_ZERO(child);

			if (ret == -EIO) {
				bch2_topology_error(c);

				if (__fsck_err(c,
					  FSCK_CAN_FIX|
					  FSCK_CAN_IGNORE|
					  FSCK_NO_RATELIMIT,
					  "Unreadable btree node at btree %s level %u:\n"
					  "  %s",
					  bch2_btree_ids[b->c.btree_id],
					  b->c.level - 1,
					  (printbuf_reset(&buf),
					   bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
				    should_restart_for_topology_repair(c)) {
					bch_info(c, "Halting mark and sweep to start topology repair pass");
					ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
					goto fsck_err;
				} else {
					/* Continue marking when opted to not
					 * fix the error: */
					ret = 0;
					set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
					continue;
				}
			} else if (ret) {
				bch_err_msg(c, ret, "getting btree node");
				break;
			}

			ret = bch2_gc_btree_init_recurse(trans, child,
							 target_depth);
			six_unlock_read(&child->c.lock);

			if (ret)
				break;
		}
	}
fsck_err:
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	bch2_btree_and_journal_iter_exit(&iter);
	printbuf_exit(&buf);
	return ret;
}
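/*
 * bch2_gc_btree_init() and the recursion above run during recovery, before
 * journal replay, so they can't use normal btree iterators: they read nodes
 * directly and overlay pending journal keys via btree_and_journal_iter.
 */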
static int bch2_gc_btree_init(struct btree_trans *trans,
			      enum btree_id btree_id,
			      bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	unsigned target_depth = metadata_only ? 1 : 0;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	b = bch2_btree_id_root(c, btree_id)->b;

	if (btree_node_fake(b))
		return 0;

	six_lock_read(&b->c.lock, NULL, NULL);
	printbuf_reset(&buf);
	bch2_bpos_to_text(&buf, b->data->min_key);
	if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
			"btree root with incorrect min_key: %s", buf.buf)) {
		bch_err(c, "repair unimplemented");
		ret = -BCH_ERR_fsck_repair_unimplemented;
		goto fsck_err;
	}

	printbuf_reset(&buf);
	bch2_bpos_to_text(&buf, b->data->max_key);
	if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
			"btree root with incorrect max_key: %s", buf.buf)) {
		bch_err(c, "repair unimplemented");
		ret = -BCH_ERR_fsck_repair_unimplemented;
		goto fsck_err;
	}

	if (b->c.level >= target_depth)
		ret = bch2_gc_btree_init_recurse(trans, b, target_depth);

	if (!ret) {
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);

		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
				       &k, true);
	}
fsck_err:
	six_unlock_read(&b->c.lock);

	if (ret < 0)
		bch_err_fn(c, ret);
	printbuf_exit(&buf);
	return ret;
}

static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return (int) btree_id_to_gc_phase(l) -
	       (int) btree_id_to_gc_phase(r);
}

static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
{
	struct btree_trans *trans = bch2_trans_get(c);
	enum btree_id ids[BTREE_ID_NR];
	unsigned i;
	int ret = 0;

	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < BTREE_ID_NR && !ret; i++)
		ret = initial
			? bch2_gc_btree_init(trans, ids[i], metadata_only)
			: bch2_gc_btree(trans, ids[i], initial, metadata_only);

	for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
		if (!bch2_btree_id_root(c, i)->alive)
			continue;

		ret = initial
			? bch2_gc_btree_init(trans, i, metadata_only)
			: bch2_gc_btree(trans, i, initial, metadata_only);
	}

	if (ret < 0)
		bch_err_fn(c, ret);

	bch2_trans_put(trans);
	return ret;
}

static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
				  u64 start, u64 end,
				  enum bch_data_type type,
				  unsigned flags)
{
	u64 b = sector_to_bucket(ca, start);

	do {
		unsigned sectors =
			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;

		bch2_mark_metadata_bucket(c, ca, b, type, sectors,
					  gc_phase(GC_PHASE_SB), flags);
		b++;
		start += sectors;
	} while (start < end);
}

static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
				     unsigned flags)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	unsigned i;
	u64 b;

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);

		if (offset == BCH_SB_SECTOR)
			mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
					      BCH_DATA_sb, flags);

		mark_metadata_sectors(c, ca, offset,
				      offset + (1 << layout->sb_max_size_bits),
				      BCH_DATA_sb, flags);
	}

	for (i = 0; i < ca->journal.nr; i++) {
		b = ca->journal.buckets[i];
		bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
					  ca->mi.bucket_size,
					  gc_phase(GC_PHASE_SB), flags);
	}
}

static void bch2_mark_superblocks(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	mutex_lock(&c->sb_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_SB));

	for_each_online_member(ca, c, i)
		bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
	mutex_unlock(&c->sb_lock);
}

#if 0
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
	struct btree_update *as;
	struct pending_btree_node_free *d;

	mutex_lock(&c->btree_interior_update_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));

	for_each_pending_btree_node_free(c, as, d)
		if (d->index_update_done)
			bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);

	mutex_unlock(&c->btree_interior_update_lock);
}
#endif

static void bch2_gc_free(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	genradix_free(&c->reflink_gc_table);
	genradix_free(&c->gc_stripes);

	for_each_member_device(ca, c, i) {
		kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
			sizeof(struct bucket_array) +
			ca->mi.nbuckets * sizeof(struct bucket));
		ca->buckets_gc = NULL;

		free_percpu(ca->usage_gc);
		ca->usage_gc = NULL;
	}

	free_percpu(c->usage_gc);
	c->usage_gc = NULL;
}
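/*
 * bch2_gc_done() below reconciles the usage counters recomputed by GC with the
 * live ones, prompting via fsck_err() when they disagree.  As a worked
 * example, copy_dev_field(buckets_ec, "buckets_ec") expands to roughly:
 *
 *	if (dst->buckets_ec != src->buckets_ec &&
 *	    (!verify ||
 *	     fsck_err(c, "dev %u has wrong buckets_ec: got %llu, should be %llu",
 *		      dev, dst->buckets_ec, src->buckets_ec)))
 *		dst->buckets_ec = src->buckets_ec;
 */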
static int bch2_gc_done(struct bch_fs *c,
			bool initial, bool metadata_only)
{
	struct bch_dev *ca = NULL;
	struct printbuf buf = PRINTBUF;
	bool verify = !metadata_only &&
		!c->opts.reconstruct_alloc &&
		(!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
	unsigned i, dev;
	int ret = 0;

	percpu_down_write(&c->mark_lock);

#define copy_field(_f, _msg, ...)					\
	if (dst->_f != src->_f &&					\
	    (!verify ||							\
	     fsck_err(c, _msg ": got %llu, should be %llu"		\
		      , ##__VA_ARGS__, dst->_f, src->_f)))		\
		dst->_f = src->_f
#define copy_dev_field(_f, _msg, ...)					\
	copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...)					\
	copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)

	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		bch2_fs_usage_acc_to_base(c, i);

	for_each_member_device(ca, c, dev) {
		struct bch_dev_usage *dst = ca->usage_base;
		struct bch_dev_usage *src = (void *)
			bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
					     dev_usage_u64s());

		copy_dev_field(buckets_ec,		"buckets_ec");

		for (i = 0; i < BCH_DATA_NR; i++) {
			copy_dev_field(d[i].buckets,	"%s buckets", bch2_data_types[i]);
			copy_dev_field(d[i].sectors,	"%s sectors", bch2_data_types[i]);
			copy_dev_field(d[i].fragmented,	"%s fragmented", bch2_data_types[i]);
		}
	}

	{
		unsigned nr = fs_usage_u64s(c);
		struct bch_fs_usage *dst = c->usage_base;
		struct bch_fs_usage *src = (void *)
			bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);

		copy_fs_field(hidden,		"hidden");
		copy_fs_field(btree,		"btree");

		if (!metadata_only) {
			copy_fs_field(data,	"data");
			copy_fs_field(cached,	"cached");
			copy_fs_field(reserved,	"reserved");
			copy_fs_field(nr_inodes,"nr_inodes");

			for (i = 0; i < BCH_REPLICAS_MAX; i++)
				copy_fs_field(persistent_reserved[i],
					      "persistent_reserved[%i]", i);
		}

		for (i = 0; i < c->replicas.nr; i++) {
			struct bch_replicas_entry *e =
				cpu_replicas_entry(&c->replicas, i);

			if (metadata_only &&
			    (e->data_type == BCH_DATA_user ||
			     e->data_type == BCH_DATA_cached))
				continue;

			printbuf_reset(&buf);
			bch2_replicas_entry_to_text(&buf, e);

			copy_fs_field(replicas[i], "%s", buf.buf);
		}
	}

#undef copy_fs_field
#undef copy_dev_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
	if (ca)
		percpu_ref_put(&ca->ref);
	if (ret)
		bch_err_fn(c, ret);

	percpu_up_write(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_start(struct bch_fs *c)
{
	struct bch_dev *ca = NULL;
	unsigned i;

	BUG_ON(c->usage_gc);

	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
					 sizeof(u64), GFP_KERNEL);
	if (!c->usage_gc) {
		bch_err(c, "error allocating c->usage_gc");
		return -BCH_ERR_ENOMEM_gc_start;
	}

	for_each_member_device(ca, c, i) {
		BUG_ON(ca->usage_gc);

		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage_gc) {
			bch_err(c, "error allocating ca->usage_gc");
			percpu_ref_put(&ca->ref);
			return -BCH_ERR_ENOMEM_gc_start;
		}

		this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
			       ca->mi.nbuckets - ca->mi.first_bucket);
	}

	return 0;
}

static int bch2_gc_reset(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	for_each_member_device(ca, c, i) {
		free_percpu(ca->usage_gc);
		ca->usage_gc = NULL;
	}

	free_percpu(c->usage_gc);
	c->usage_gc = NULL;

	return bch2_gc_start(c);
}
/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
				     struct bch_alloc_v4 r)
{
	return  l.gen != r.gen				||
		l.oldest_gen != r.oldest_gen		||
		l.data_type != r.data_type		||
		l.dirty_sectors	!= r.dirty_sectors	||
		l.cached_sectors != r.cached_sectors	||
		l.stripe_redundancy != r.stripe_redundancy ||
		l.stripe != r.stripe;
}

static int bch2_alloc_write_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k,
				bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
	struct bucket gc, *b;
	struct bkey_i_alloc_v4 *a;
	struct bch_alloc_v4 old_convert, new;
	const struct bch_alloc_v4 *old;
	enum bch_data_type type;
	int ret;

	if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
		return 1;

	old = bch2_alloc_to_v4(k, &old_convert);
	new = *old;

	percpu_down_read(&c->mark_lock);
	b = gc_bucket(ca, iter->pos.offset);

	/*
	 * b->data_type doesn't yet include need_discard & need_gc_gen states -
	 * fix that here:
	 */
	type = __alloc_data_type(b->dirty_sectors,
				 b->cached_sectors,
				 b->stripe,
				 *old,
				 b->data_type);
	if (b->data_type != type) {
		struct bch_dev_usage *u;

		preempt_disable();
		u = this_cpu_ptr(ca->usage_gc);
		u->d[b->data_type].buckets--;
		b->data_type = type;
		u->d[b->data_type].buckets++;
		preempt_enable();
	}

	gc = *b;
	percpu_up_read(&c->mark_lock);

	if (metadata_only &&
	    gc.data_type != BCH_DATA_sb &&
	    gc.data_type != BCH_DATA_journal &&
	    gc.data_type != BCH_DATA_btree)
		return 0;

	if (gen_after(old->gen, gc.gen))
		return 0;

	if (c->opts.reconstruct_alloc ||
	    fsck_err_on(new.data_type != gc.data_type, c,
			"bucket %llu:%llu gen %u has wrong data_type"
			": got %s, should be %s",
			iter->pos.inode, iter->pos.offset,
			gc.gen,
			bch2_data_types[new.data_type],
			bch2_data_types[gc.data_type]))
		new.data_type = gc.data_type;

#define copy_bucket_field(_f)						\
	if (c->opts.reconstruct_alloc ||				\
	    fsck_err_on(new._f != gc._f, c,				\
			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
			": got %u, should be %u",			\
			iter->pos.inode, iter->pos.offset,		\
			gc.gen,						\
			bch2_data_types[gc.data_type],			\
			new._f, gc._f))					\
		new._f = gc._f;						\

	copy_bucket_field(gen);
	copy_bucket_field(dirty_sectors);
	copy_bucket_field(cached_sectors);
	copy_bucket_field(stripe_redundancy);
	copy_bucket_field(stripe);
#undef copy_bucket_field

	if (!bch2_alloc_v4_cmp(*old, new))
		return 0;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	a->v = new;

	/*
	 * The trigger normally makes sure this is set, but we're not running
	 * triggers:
	 */
	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
fsck_err:
	return ret;
}
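/*
 * At the end of GC the in-memory gc bucket is treated as authoritative: the
 * helper above copies its fields into the alloc key and, when anything
 * changed, queues the update with BTREE_TRIGGER_NORUN, i.e. without running
 * the usual alloc triggers (hence the io_time[READ] fixup done by hand).
 */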
static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_dev *ca;
	unsigned i;
	int ret = 0;

	for_each_member_device(ca, c, i) {
		ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS(ca->dev_idx, ca->mi.first_bucket),
				BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
				NULL, NULL, BTREE_INSERT_LAZY_RW,
			bch2_alloc_write_key(trans, &iter, k, metadata_only));

		if (ret < 0) {
			bch_err_fn(c, ret);
			percpu_ref_put(&ca->ref);
			break;
		}
	}

	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}

static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
	struct bch_dev *ca;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bucket *g;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	unsigned i;
	int ret;

	for_each_member_device(ca, c, i) {
		struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
				ca->mi.nbuckets * sizeof(struct bucket),
				GFP_KERNEL|__GFP_ZERO);
		if (!buckets) {
			percpu_ref_put(&ca->ref);
			bch_err(c, "error allocating ca->buckets[gc]");
			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
			goto err;
		}

		buckets->first_bucket	= ca->mi.first_bucket;
		buckets->nbuckets	= ca->mi.nbuckets;
		rcu_assign_pointer(ca->buckets_gc, buckets);
	}

	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		ca = bch_dev_bkey_exists(c, k.k->p.inode);
		g = gc_bucket(ca, k.k->p.offset);

		a = bch2_alloc_to_v4(k, &a_convert);

		g->gen_valid	= 1;
		g->gen		= a->gen;

		if (metadata_only &&
		    (a->data_type == BCH_DATA_user ||
		     a->data_type == BCH_DATA_cached ||
		     a->data_type == BCH_DATA_parity)) {
			g->data_type		= a->data_type;
			g->dirty_sectors	= a->dirty_sectors;
			g->cached_sectors	= a->cached_sectors;
			g->stripe		= a->stripe;
			g->stripe_redundancy	= a->stripe_redundancy;
		}
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	bch2_trans_put(trans);
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
{
	struct bch_dev *ca;
	unsigned i;

	for_each_member_device(ca, c, i) {
		struct bucket_array *buckets = gc_bucket_array(ca);
		struct bucket *g;

		for_each_bucket(g, buckets) {
			if (metadata_only &&
			    (g->data_type == BCH_DATA_user ||
			     g->data_type == BCH_DATA_cached ||
			     g->data_type == BCH_DATA_parity))
				continue;
			g->data_type = 0;
			g->dirty_sectors = 0;
			g->cached_sectors = 0;
		}
	}
}
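/*
 * Reflink GC works off a separate table: bch2_gc_reflink_start() below records
 * one entry per reflink key, the mark path accumulates the reference counts it
 * actually sees, and bch2_gc_write_reflink_key() then rewrites (or deletes)
 * any reflink key whose stored refcount disagrees.
 */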
static int bch2_gc_write_reflink_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k,
				     size_t *idx)
{
	struct bch_fs *c = trans->c;
	const __le64 *refcount = bkey_refcount_c(k);
	struct printbuf buf = PRINTBUF;
	struct reflink_gc *r;
	int ret = 0;

	if (!refcount)
		return 0;

	while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
	       r->offset < k.k->p.offset)
		++*idx;

	if (!r ||
	    r->offset != k.k->p.offset ||
	    r->size != k.k->size) {
		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
		return -EINVAL;
	}

	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
			"reflink key has wrong refcount:\n"
			"  %s\n"
			"  should be %u",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
			r->refcount)) {
		struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);

		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		if (!r->refcount)
			new->k.type = KEY_TYPE_deleted;
		else
			*bkey_refcount(new) = cpu_to_le64(r->refcount);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	size_t idx = 0;
	int ret = 0;

	if (metadata_only)
		return 0;

	trans = bch2_trans_get(c);

	ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_reflink, POS_MIN,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_NOFAIL,
		bch2_gc_write_reflink_key(trans, &iter, k, &idx));

	c->reflink_gc_nr = 0;
	bch2_trans_put(trans);
	return ret;
}

static int bch2_gc_reflink_start(struct bch_fs *c,
				 bool metadata_only)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct reflink_gc *r;
	int ret = 0;

	if (metadata_only)
		return 0;

	trans = bch2_trans_get(c);
	c->reflink_gc_nr = 0;

	for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		const __le64 *refcount = bkey_refcount_c(k);

		if (!refcount)
			continue;

		r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
				       GFP_KERNEL);
		if (!r) {
			ret = -BCH_ERR_ENOMEM_gc_reflink_start;
			break;
		}

		r->offset	= k.k->p.offset;
		r->size		= k.k->size;
		r->refcount	= 0;
	}
	bch2_trans_iter_exit(trans, &iter);

	bch2_trans_put(trans);
	return ret;
}

static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
{
	struct genradix_iter iter;
	struct reflink_gc *r;

	genradix_for_each(&c->reflink_gc_table, iter, r)
		r->refcount = 0;
}
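/*
 * Erasure coded stripes get the same treatment: block sector counts
 * accumulated in c->gc_stripes during marking are compared against the stripe
 * keys below, and any mismatched key is rewritten with the recomputed counts.
 */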
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	const struct bch_stripe *s;
	struct gc_stripe *m;
	bool bad = false;
	unsigned i;
	int ret = 0;

	if (k.k->type != KEY_TYPE_stripe)
		return 0;

	s = bkey_s_c_to_stripe(k).v;
	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);

	for (i = 0; i < s->nr_blocks; i++) {
		u32 old = stripe_blockcount_get(s, i);
		u32 new = (m ? m->block_sectors[i] : 0);

		if (old != new) {
			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
				   i, old, new);
			bad = true;
		}
	}

	if (bad)
		bch2_bkey_val_to_text(&buf, c, k);

	if (fsck_err_on(bad, c, "%s", buf.buf)) {
		struct bkey_i_stripe *new;

		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		bkey_reassemble(&new->k_i, k);

		for (i = 0; i < new->v.nr_blocks; i++)
			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);

		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	if (metadata_only)
		return 0;

	trans = bch2_trans_get(c);

	ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_stripes, POS_MIN,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_NOFAIL,
		bch2_gc_write_stripes_key(trans, &iter, k));

	bch2_trans_put(trans);
	return ret;
}

static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
{
	genradix_free(&c->gc_stripes);
}

/**
 * bch2_gc - walk _all_ references to buckets, and recompute them:
 *
 * @c:			filesystem object
 * @initial:		are we in recovery?
 * @metadata_only:	are we just checking metadata references, or everything?
 *
 * Returns: 0 on success, or standard errcode on failure
 *
 * Order matters here:
 *  - Concurrent GC relies on the fact that we have a total ordering for
 *    everything that GC walks - see gc_will_visit_node(),
 *    gc_will_visit_root()
 *
 *  - also, references move around in the course of index updates and
 *    various other crap: everything needs to agree on the ordering
 *    references are allowed to move around in - e.g., we're allowed to
 *    start with a reference owned by an open_bucket (the allocator) and
 *    move it to the btree, but not the reverse.
 *
 *    This is necessary to ensure that gc doesn't miss references that
 *    move around - if references move backwards in the ordering GC
 *    uses, GC could skip past them
 */
int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
	unsigned iter = 0;
	int ret;

	lockdep_assert_held(&c->state_lock);

	down_write(&c->gc_lock);

	bch2_btree_interior_updates_flush(c);

	ret   = bch2_gc_start(c) ?:
		bch2_gc_alloc_start(c, metadata_only) ?:
		bch2_gc_reflink_start(c, metadata_only);
	if (ret)
		goto out;
again:
	gc_pos_set(c, gc_phase(GC_PHASE_START));

	bch2_mark_superblocks(c);

	ret = bch2_gc_btrees(c, initial, metadata_only);

	if (ret)
		goto out;

#if 0
	bch2_mark_pending_btree_node_frees(c);
#endif
	c->gc_count++;

	if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
	    (!iter && bch2_test_restart_gc)) {
		if (iter++ > 2) {
			bch_info(c, "Unable to fix bucket gens, looping");
			ret = -EINVAL;
			goto out;
		}

		/*
		 * XXX: make sure gens we fixed got saved
		 */
		bch_info(c, "Second GC pass needed, restarting:");
		clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
		__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

		bch2_gc_stripes_reset(c, metadata_only);
		bch2_gc_alloc_reset(c, metadata_only);
		bch2_gc_reflink_reset(c, metadata_only);
		ret = bch2_gc_reset(c);
		if (ret)
			goto out;

		/* flush fsck errors, reset counters */
		bch2_flush_fsck_errs(c);
		goto again;
	}
out:
	if (!ret) {
		bch2_journal_block(&c->journal);

		ret   = bch2_gc_stripes_done(c, metadata_only) ?:
			bch2_gc_reflink_done(c, metadata_only) ?:
			bch2_gc_alloc_done(c, metadata_only) ?:
			bch2_gc_done(c, initial, metadata_only);

		bch2_journal_unblock(&c->journal);
	}

	percpu_down_write(&c->mark_lock);
	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

	bch2_gc_free(c);
	percpu_up_write(&c->mark_lock);

	up_write(&c->gc_lock);

	/*
	 * At startup, allocations can happen directly instead of via the
	 * allocator thread - issue wakeup in case they blocked on gc_lock:
	 */
	closure_wake_up(&c->freelist_wait);

	if (ret)
		bch_err_fn(c, ret);
	return ret;
}
static int gc_btree_gens_key(struct btree_trans *trans,
			     struct btree_iter *iter,
			     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	struct bkey_i *u;
	int ret;

	percpu_down_read(&c->mark_lock);
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

		if (ptr_stale(ca, ptr) > 16) {
			percpu_up_read(&c->mark_lock);
			goto update;
		}
	}

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];

		if (gen_after(*gen, ptr->gen))
			*gen = ptr->gen;
	}
	percpu_up_read(&c->mark_lock);
	return 0;
update:
	u = bch2_bkey_make_mut(trans, iter, &k, 0);
	ret = PTR_ERR_OR_ZERO(u);
	if (ret)
		return ret;

	bch2_extent_normalize(c, bkey_i_to_s(u));
	return 0;
}

static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
				       struct bkey_s_c k)
{
	struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
	struct bkey_i_alloc_v4 *a_mut;
	int ret;

	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
		return 0;

	a_mut = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a_mut);
	if (ret)
		return ret;

	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
	a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);

	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
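/*
 * bch2_gc_gens() is the lightweight GC that normally runs instead of a full
 * mark and sweep: it walks every btree with pointers, drops cached pointers
 * that have gone excessively stale (via bch2_extent_normalize()), computes the
 * oldest pointer generation referencing each bucket, and writes that back as
 * the alloc key's oldest_gen so bucket generation numbers can keep wrapping
 * safely.
 */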
int bch2_gc_gens(struct bch_fs *c)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_dev *ca;
	u64 b, start_time = local_clock();
	unsigned i;
	int ret;

	/*
	 * Ideally we would be using state_lock and not gc_lock here, but that
	 * introduces a deadlock in the RO path - we currently take the state
	 * lock at the start of going RO, thus the gc thread may get stuck:
	 */
	if (!mutex_trylock(&c->gc_gens_lock))
		return 0;

	trace_and_count(c, gc_gens_start, c);
	down_read(&c->gc_lock);
	trans = bch2_trans_get(c);

	for_each_member_device(ca, c, i) {
		struct bucket_gens *gens;

		BUG_ON(ca->oldest_gen);

		ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
		if (!ca->oldest_gen) {
			percpu_ref_put(&ca->ref);
			ret = -BCH_ERR_ENOMEM_gc_gens;
			goto err;
		}

		gens = bucket_gens(ca);

		for (b = gens->first_bucket;
		     b < gens->nbuckets; b++)
			ca->oldest_gen[b] = gens->b[b];
	}

	for (i = 0; i < BTREE_ID_NR; i++)
		if (btree_type_has_ptrs(i)) {
			c->gc_gens_btree = i;
			c->gc_gens_pos = POS_MIN;

			ret = for_each_btree_key_commit(trans, iter, i,
					POS_MIN,
					BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
					k,
					NULL, NULL,
					BTREE_INSERT_NOFAIL,
				gc_btree_gens_key(trans, &iter, k));
			if (ret && !bch2_err_matches(ret, EROFS))
				bch_err_fn(c, ret);
			if (ret)
				goto err;
		}

	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
			POS_MIN,
			BTREE_ITER_PREFETCH,
			k,
			NULL, NULL,
			BTREE_INSERT_NOFAIL,
		bch2_alloc_write_oldest_gen(trans, &iter, k));
	if (ret && !bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	if (ret)
		goto err;

	c->gc_gens_btree	= 0;
	c->gc_gens_pos		= POS_MIN;

	c->gc_count++;

	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
	trace_and_count(c, gc_gens_end, c);
err:
	for_each_member_device(ca, c, i) {
		kvfree(ca->oldest_gen);
		ca->oldest_gen = NULL;
	}

	bch2_trans_put(trans);
	up_read(&c->gc_lock);
	mutex_unlock(&c->gc_gens_lock);
	return ret;
}

static int bch2_gc_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct io_clock *clock = &c->io_clock[WRITE];
	unsigned long last = atomic64_read(&clock->now);
	unsigned last_kick = atomic_read(&c->kick_gc);
	int ret;

	set_freezable();

	while (1) {
		while (1) {
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				return 0;
			}

			if (atomic_read(&c->kick_gc) != last_kick)
				break;

			if (c->btree_gc_periodic) {
				unsigned long next = last + c->capacity / 16;

				if (atomic64_read(&clock->now) >= next)
					break;

				bch2_io_clock_schedule_timeout(clock, next);
			} else {
				schedule();
			}

			try_to_freeze();
		}
		__set_current_state(TASK_RUNNING);

		last = atomic64_read(&clock->now);
		last_kick = atomic_read(&c->kick_gc);

		/*
		 * Full gc is currently incompatible with btree key cache:
		 */
#if 0
		ret = bch2_gc(c, false, false);
#else
		ret = bch2_gc_gens(c);
#endif
		if (ret < 0)
			bch_err_fn(c, ret);

		debug_check_no_locks_held();
	}

	return 0;
}

void bch2_gc_thread_stop(struct bch_fs *c)
{
	struct task_struct *p;

	p = c->gc_thread;
	c->gc_thread = NULL;

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_gc_thread_start(struct bch_fs *c)
{
	struct task_struct *p;

	if (c->gc_thread)
		return 0;

	p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
	if (IS_ERR(p)) {
		bch_err_fn(c, PTR_ERR(p));
		return PTR_ERR(p);
	}

	get_task_struct(p);
	c->gc_thread = p;
	wake_up_process(p);
	return 0;
}