// SPDX-License-Identifier: GPL-2.0
/*
 * bcachefs setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "async_objs.h"
#include "backpointers.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fsck.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
#include "thread_with_file.h"
#include "trace.h"

#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");

typedef DARRAY(struct bch_sb_handle) bch_sb_handles;

#define x(n)	#n,
const char * const bch2_fs_flag_strs[] = {
	BCH_FS_FLAGS()
	NULL
};

const char * const bch2_write_refs[] = {
	BCH_WRITE_REFS()
	NULL
};

const char * const bch2_dev_read_refs[] = {
	BCH_DEV_READ_REFS()
	NULL
};

const char * const bch2_dev_write_refs[] = {
	BCH_DEV_WRITE_REFS()
	NULL
};
#undef x

static void __bch2_print_str(struct bch_fs *c, const char *prefix,
			     const char *str)
{
#ifdef __KERNEL__
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

	if (unlikely(stdio)) {
		bch2_stdio_redirect_printf(stdio, true, "%s", str);
		return;
	}
#endif
	bch2_print_string_as_lines(KERN_ERR, str);
}

void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
{
	__bch2_print_str(c, prefix, str);
}

__printf(2, 0)
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
#ifdef __KERNEL__
	if (unlikely(stdio)) {
		if (fmt[0] == KERN_SOH[0])
			fmt += 2;

		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
		return;
	}
#endif
	vprintk(fmt, args);
}

void bch2_print_opts(struct bch_opts *opts, const char *fmt,
		     ...)
{
	struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;

	va_list args;
	va_start(args, fmt);
	bch2_print_maybe_redirect(stdio, fmt, args);
	va_end(args);
}

void __bch2_print(struct bch_fs *c, const char *fmt, ...)
{
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

	va_list args;
	va_start(args, fmt);
	bch2_print_maybe_redirect(stdio, fmt, args);
	va_end(args);
}

#define KTYPE(type)							\
static const struct attribute_group type ## _group = {			\
	.attrs = type ## _files						\
};									\
									\
static const struct attribute_group *type ## _groups[] = {		\
	&type ## _group,						\
	NULL								\
};									\
									\
static const struct kobj_type type ## _ktype = {			\
	.release	= type ## _release,				\
	.sysfs_ops	= &type ## _sysfs_ops,				\
	.default_groups = type ## _groups				\
}

static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);
static void bch2_fs_counters_release(struct kobject *k)
{
}

static void bch2_fs_internal_release(struct kobject *k)
{
}

static void bch2_fs_opts_dir_release(struct kobject *k)
{
}

static void bch2_fs_time_stats_release(struct kobject *k)
{
}

KTYPE(bch2_fs);
KTYPE(bch2_fs_counters);
KTYPE(bch2_fs_internal);
KTYPE(bch2_fs_opts_dir);
KTYPE(bch2_fs_time_stats);
KTYPE(bch2_dev);

static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);

DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);

static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);

struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
	guard(mutex)(&bch_fs_list_lock);
	guard(rcu)();

	struct bch_fs *c;
	list_for_each_entry(c, &bch_fs_list, list)
		for_each_member_device_rcu(c, ca, NULL)
			if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
				closure_get(&c->cl);
				return c;
			}
	return NULL;
}

static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *c;

	lockdep_assert_held(&bch_fs_list_lock);

	list_for_each_entry(c, &bch_fs_list, list)
		if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
			return c;

	return NULL;
}

struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *c;

	mutex_lock(&bch_fs_list_lock);
	c = __bch2_uuid_to_fs(uuid);
	if (c)
		closure_get(&c->cl);
	mutex_unlock(&bch_fs_list_lock);

	return c;
}

/* Filesystem RO/RW: */

/*
 * For startup/shutdown of RW stuff, the dependencies are:
 *
 * - foreground writes depend on copygc and rebalance (to free up space)
 *
 * - copygc and rebalance depend on mark and sweep gc (they actually probably
 *   don't because they either reserve ahead of time or don't block if
 *   allocations fail, but allocations can require mark and sweep gc to run
 *   because of generation number wraparound)
 *
 * - all of the above depends on the allocator threads
 *
 * - allocator depends on the journal (when it rewrites prios and gens)
 */

static void
__bch2_fs_read_only(struct bch_fs *c)
{
	unsigned clean_passes = 0;
	u64 seq = 0;

	bch2_fs_ec_stop(c);
	bch2_open_buckets_stop(c, NULL, true);
	bch2_rebalance_stop(c);
	bch2_copygc_stop(c);
	bch2_fs_ec_flush(c);

	bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
		    journal_cur_seq(&c->journal));

	/* Repeat until two consecutive passes complete with nothing left to flush: */
	do {
		clean_passes++;

		if (bch2_btree_interior_updates_flush(c) ||
		    bch2_btree_write_buffer_flush_going_ro(c) ||
		    bch2_journal_flush_all_pins(&c->journal) ||
		    bch2_btree_flush_all_writes(c) ||
		    seq != atomic64_read(&c->journal.seq)) {
			seq = atomic64_read(&c->journal.seq);
			clean_passes = 0;
		}
	} while (clean_passes < 2);

	bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
		    journal_cur_seq(&c->journal));

	if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
	    !test_bit(BCH_FS_emergency_ro, &c->flags))
		set_bit(BCH_FS_clean_shutdown, &c->flags);

	bch2_fs_journal_stop(&c->journal);

	bch_info(c, "%sclean shutdown complete, journal seq %llu",
		 test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
		 c->journal.seq_ondisk);

	/*
	 * After stopping journal:
	 */
	for_each_member_device(c, ca) {
		bch2_dev_io_ref_stop(ca, WRITE);
		bch2_dev_allocator_remove(c, ca);
	}
}

static void bch2_writes_disabled(struct enumerated_ref *writes)
{
	struct bch_fs *c = container_of(writes, struct bch_fs, writes);

	set_bit(BCH_FS_write_disable_complete, &c->flags);
	wake_up(&bch2_read_only_wait);
}

void bch2_fs_read_only(struct bch_fs *c)
{
	if (!test_bit(BCH_FS_rw, &c->flags)) {
		bch2_journal_reclaim_stop(&c->journal);
		return;
	}

	BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));

	bch_verbose(c, "going read-only");

	/*
	 * Block new foreground-end write operations from starting - any new
	 * writes will return -EROFS:
	 */
	set_bit(BCH_FS_going_ro, &c->flags);
	enumerated_ref_stop_async(&c->writes);

	/*
	 * If we're not doing an emergency shutdown, we want to wait on
	 * outstanding writes to complete so they don't see spurious errors due
	 * to shutting down the allocator:
	 *
	 * If we are doing an emergency shutdown outstanding writes may
	 * hang until we shutdown the allocator so we don't want to wait
	 * on outstanding writes before shutting everything down - but
	 * we do need to wait on them before returning and signalling
	 * that going RO is complete:
	 */
	wait_event(bch2_read_only_wait,
		   test_bit(BCH_FS_write_disable_complete, &c->flags) ||
		   test_bit(BCH_FS_emergency_ro, &c->flags));

	bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
	if (writes_disabled)
		bch_verbose(c, "finished waiting for writes to stop");

	__bch2_fs_read_only(c);

	wait_event(bch2_read_only_wait,
		   test_bit(BCH_FS_write_disable_complete, &c->flags));

	if (!writes_disabled)
		bch_verbose(c, "finished waiting for writes to stop");

	clear_bit(BCH_FS_write_disable_complete, &c->flags);
	clear_bit(BCH_FS_going_ro, &c->flags);
	clear_bit(BCH_FS_rw, &c->flags);

	if (!bch2_journal_error(&c->journal) &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    !test_bit(BCH_FS_emergency_ro, &c->flags) &&
	    test_bit(BCH_FS_started, &c->flags) &&
	    test_bit(BCH_FS_clean_shutdown, &c->flags) &&
	    c->recovery.pass_done >=
			BCH_RECOVERY_PASS_journal_replay) {
		BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
		BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
		BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
		BUG_ON(c->btree_write_buffer.inc.keys.nr);
		BUG_ON(c->btree_write_buffer.flushing.keys.nr);
		bch2_verify_accounting_clean(c);

		bch_verbose(c, "marking filesystem clean");
		bch2_fs_mark_clean(c);
	} else {
		/* Make sure error counts/counters are persisted */
		mutex_lock(&c->sb_lock);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);

		bch_verbose(c, "done going read-only, filesystem not clean");
	}
}

static void bch2_fs_read_only_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, read_only_work);

	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);
}

static void bch2_fs_read_only_async(struct bch_fs *c)
{
	queue_work(system_long_wq, &c->read_only_work);
}

bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	bch2_journal_halt(&c->journal);
	bch2_fs_read_only_async(c);

	wake_up(&bch2_read_only_wait);
	return ret;
}

static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
					   bool locked)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	if (!locked)
		bch2_journal_halt(&c->journal);
	else
		bch2_journal_halt_locked(&c->journal);
	bch2_fs_read_only_async(c);
	wake_up(&bch2_read_only_wait);

	if (ret)
		prt_printf(out, "emergency read only at seq %llu\n",
			   journal_cur_seq(&c->journal));

	return ret;
}

bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
{
	return __bch2_fs_emergency_read_only2(c, out, false);
}

bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	bch2_journal_halt_locked(&c->journal);
	bch2_fs_read_only_async(c);

	wake_up(&bch2_read_only_wait);
	return ret;
}

static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
	int ret;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
		return bch_err_throw(c, erofs_no_alloc_info);

	if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
		bch_err(c, "cannot go rw, unfixed btree errors");
		return bch_err_throw(c, erofs_unfixed_errors);
	}

	if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
		bch_err(c, "cannot go rw, filesystem is an unresized image file");
		return bch_err_throw(c, erofs_filesystem_full);
	}

	if (test_bit(BCH_FS_rw, &c->flags))
		return 0;

	bch_info(c, "going read-write");

	ret = bch2_fs_init_rw(c);
	if (ret)
		goto err;

	ret = bch2_sb_members_v2_init(c);
	if (ret)
		goto err;

	clear_bit(BCH_FS_clean_shutdown, &c->flags);

	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca)
			if (ca->mi.state == BCH_MEMBER_STATE_rw) {
				bch2_dev_allocator_add(c, ca);
				enumerated_ref_start(&ca->io_ref[WRITE]);
			}

	bch2_recalc_capacity(c);

	/*
	 * First journal write must be a flush write: after a clean shutdown we
	 * don't read the journal, so the first journal write may end up
	 * overwriting whatever was there previously, and there must always be
	 * at least one non-flush write in the journal or recovery will fail:
	 */
	spin_lock(&c->journal.lock);
	set_bit(JOURNAL_need_flush_write, &c->journal.flags);
	set_bit(JOURNAL_running, &c->journal.flags);
	bch2_journal_space_available(&c->journal);
	spin_unlock(&c->journal.lock);

	ret = bch2_fs_mark_dirty(c);
	if (ret)
		goto err;

	ret = bch2_journal_reclaim_start(&c->journal);
	if (ret)
		goto err;

	set_bit(BCH_FS_rw, &c->flags);
	set_bit(BCH_FS_was_rw, &c->flags);

	enumerated_ref_start(&c->writes);

	ret = bch2_copygc_start(c);
	if (ret) {
		bch_err_msg(c, ret, "error starting copygc thread");
		goto err;
	}

	ret = bch2_rebalance_start(c);
	if (ret) {
		bch_err_msg(c, ret, "error starting rebalance thread");
		goto err;
	}

	bch2_do_discards(c);
	bch2_do_invalidates(c);
	bch2_do_stripe_deletes(c);
	bch2_do_pending_node_rewrites(c);
	return 0;
err:
	if (test_bit(BCH_FS_rw, &c->flags))
		bch2_fs_read_only(c);
	else
		__bch2_fs_read_only(c);
	return ret;
}

int bch2_fs_read_write(struct bch_fs *c)
{
	if (c->opts.recovery_pass_last &&
	    c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
		return bch_err_throw(c, erofs_norecovery);

	if (c->opts.nochanges)
		return bch_err_throw(c, erofs_nochanges);

	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
		return bch_err_throw(c, erofs_no_alloc_info);

	return __bch2_fs_read_write(c, false);
}

int bch2_fs_read_write_early(struct bch_fs *c)
{
	down_write(&c->state_lock);
	int ret = __bch2_fs_read_write(c, true);
	up_write(&c->state_lock);

	return ret;
}

/* Filesystem startup/shutdown: */

static void __bch2_fs_free(struct bch_fs *c)
{
	for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
		bch2_time_stats_exit(&c->times[i]);

#ifdef CONFIG_UNICODE
	utf8_unload(c->cf_encoding);
#endif

	bch2_find_btree_nodes_exit(&c->found_btree_nodes);
	bch2_free_pending_node_rewrites(c);
	bch2_free_fsck_errs(c);
	bch2_fs_vfs_exit(c);
	bch2_fs_snapshots_exit(c);
	bch2_fs_sb_errors_exit(c);
	bch2_fs_replicas_exit(c);
	bch2_fs_rebalance_exit(c);
	bch2_fs_quota_exit(c);
	bch2_fs_nocow_locking_exit(c);
	bch2_fs_journal_exit(&c->journal);
	bch2_fs_fs_io_direct_exit(c);
	bch2_fs_fs_io_buffered_exit(c);
	bch2_fs_fsio_exit(c);
	bch2_fs_io_write_exit(c);
	bch2_fs_io_read_exit(c);
	bch2_fs_encryption_exit(c);
	bch2_fs_ec_exit(c);
	bch2_fs_counters_exit(c);
	bch2_fs_compress_exit(c);
	bch2_io_clock_exit(&c->io_clock[WRITE]);
	bch2_io_clock_exit(&c->io_clock[READ]);
	bch2_fs_buckets_waiting_for_journal_exit(c);
	bch2_fs_btree_write_buffer_exit(c);
	bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
	bch2_fs_btree_iter_exit(c);
	bch2_fs_btree_interior_update_exit(c);
	bch2_fs_btree_cache_exit(c);
	bch2_fs_accounting_exit(c);
	bch2_fs_async_obj_exit(c);
	bch2_journal_keys_put_initial(c);
	bch2_find_btree_nodes_exit(&c->found_btree_nodes);

	BUG_ON(atomic_read(&c->journal_keys.ref));
	percpu_free_rwsem(&c->mark_lock);
	if (c->online_reserved) {
		u64 v = percpu_u64_get(c->online_reserved);
		WARN(v, "online_reserved not 0 at shutdown: %lli", v);
		free_percpu(c->online_reserved);
	}

	darray_exit(&c->incompat_versions_requested);
	darray_exit(&c->btree_roots_extra);
	free_percpu(c->pcpu);
	free_percpu(c->usage);
	mempool_exit(&c->large_bkey_pool);
	mempool_exit(&c->btree_bounce_pool);
	bioset_exit(&c->btree_bio);
	mempool_exit(&c->fill_iter);
	enumerated_ref_exit(&c->writes);
	kfree(rcu_dereference_protected(c->disk_groups, 1));
	kfree(c->journal_seq_blacklist_table);

	if (c->write_ref_wq)
		destroy_workqueue(c->write_ref_wq);
	if (c->btree_write_submit_wq)
		destroy_workqueue(c->btree_write_submit_wq);
	if (c->btree_read_complete_wq)
		destroy_workqueue(c->btree_read_complete_wq);
	if (c->copygc_wq)
		destroy_workqueue(c->copygc_wq);
	if (c->btree_write_complete_wq)
		destroy_workqueue(c->btree_write_complete_wq);
	if (c->btree_update_wq)
		destroy_workqueue(c->btree_update_wq);

	bch2_free_super(&c->disk_sb);
	kvfree(c);
	module_put(THIS_MODULE);
}

static void bch2_fs_release(struct kobject *kobj)
{
	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

	__bch2_fs_free(c);
}

void __bch2_fs_stop(struct bch_fs *c)
{
	bch_verbose(c, "shutting down");

	set_bit(BCH_FS_stopping, &c->flags);

	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);

	for (unsigned i = 0; i < c->sb.nr_devices; i++) {
		struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
		if (ca)
			bch2_dev_io_ref_stop(ca, READ);
	}

	for_each_member_device(c, ca)
		bch2_dev_unlink(ca);

	if (c->kobj.state_in_sysfs)
		kobject_del(&c->kobj);

	bch2_fs_debug_exit(c);
	bch2_fs_chardev_exit(c);

	bch2_ro_ref_put(c);
	wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));

	kobject_put(&c->counters_kobj);
	kobject_put(&c->time_stats);
	kobject_put(&c->opts_dir);
	kobject_put(&c->internal);

	/* btree prefetch might have kicked off reads in the background: */
	bch2_btree_flush_all_reads(c);

	for_each_member_device(c, ca)
		cancel_work_sync(&ca->io_error_work);

	cancel_work_sync(&c->read_only_work);
}

void bch2_fs_free(struct bch_fs *c)
{
	mutex_lock(&bch_fs_list_lock);
	list_del(&c->list);
	mutex_unlock(&bch_fs_list_lock);

	closure_sync(&c->cl);
	closure_debug_destroy(&c->cl);

	for (unsigned i = 0; i < c->sb.nr_devices; i++) {
		struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);

		if (ca) {
			EBUG_ON(atomic_long_read(&ca->ref) != 1);
			bch2_dev_io_ref_stop(ca, READ);
			bch2_free_super(&ca->disk_sb);
			bch2_dev_free(ca);
		}
	}

	bch_verbose(c, "shutdown complete");

	kobject_put(&c->kobj);
}

void bch2_fs_stop(struct bch_fs *c)
{
	__bch2_fs_stop(c);
	bch2_fs_free(c);
}

static int bch2_fs_online(struct bch_fs *c)
{
	int ret = 0;

	lockdep_assert_held(&bch_fs_list_lock);

	if (c->sb.multi_device &&
	    __bch2_uuid_to_fs(c->sb.uuid)) {
		bch_err(c, "filesystem UUID already open");
		return bch_err_throw(c, filesystem_uuid_already_open);
	}

	ret = bch2_fs_chardev_init(c);
	if (ret) {
		bch_err(c, "error creating character device");
		return ret;
	}

	bch2_fs_debug_init(c);

	ret = (c->sb.multi_device
	       ?
	       kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b)
	       : kobject_add(&c->kobj, NULL, "%s", c->name)) ?:
		kobject_add(&c->internal, &c->kobj, "internal") ?:
		kobject_add(&c->opts_dir, &c->kobj, "options") ?:
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
		kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
		kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
		bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
	if (ret) {
		bch_err(c, "error creating sysfs objects");
		return ret;
	}

	down_write(&c->state_lock);

	for_each_member_device(c, ca) {
		ret = bch2_dev_sysfs_online(c, ca);
		if (ret) {
			bch_err(c, "error creating sysfs objects");
			bch2_dev_put(ca);
			goto err;
		}
	}

	BUG_ON(!list_empty(&c->list));
	list_add(&c->list, &bch_fs_list);
err:
	up_write(&c->state_lock);
	return ret;
}

int bch2_fs_init_rw(struct bch_fs *c)
{
	if (test_bit(BCH_FS_rw_init_done, &c->flags))
		return 0;

	if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
	    !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
	    !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
	    !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
	    !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
				WQ_FREEZABLE, 0)))
		return bch_err_throw(c, ENOMEM_fs_other_alloc);

	int ret = bch2_fs_btree_interior_update_init(c) ?:
		bch2_fs_btree_write_buffer_init(c) ?:
		bch2_fs_fs_io_buffered_init(c) ?:
		bch2_fs_io_write_init(c) ?:
		bch2_fs_journal_init(&c->journal);
	if (ret)
		return ret;

	set_bit(BCH_FS_rw_init_done, &c->flags);
	return 0;
}

static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
				    bch_sb_handles *sbs)
{
	struct bch_fs *c;
	struct printbuf name = PRINTBUF;
	unsigned i, iter_size;
	int ret = 0;

	c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
	if (!c) {
		c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
		goto out;
	}

	c->stdio = (void *)(unsigned long) opts->stdio;

	__module_get(THIS_MODULE);

	closure_init(&c->cl, NULL);

	c->kobj.kset = bcachefs_kset;
	kobject_init(&c->kobj, &bch2_fs_ktype);
	kobject_init(&c->internal, &bch2_fs_internal_ktype);
	kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
	kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
	kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);

	c->minor = -1;
	c->disk_sb.fs_sb = true;

	init_rwsem(&c->state_lock);
	mutex_init(&c->sb_lock);
	mutex_init(&c->replicas_gc_lock);
	mutex_init(&c->btree_root_lock);
	INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);

	refcount_set(&c->ro_ref, 1);
	init_waitqueue_head(&c->ro_ref_wait);

	for (i = 0; i < BCH_TIME_STAT_NR; i++)
		bch2_time_stats_init(&c->times[i]);

	bch2_fs_allocator_background_init(c);
	bch2_fs_allocator_foreground_init(c);
	bch2_fs_btree_cache_init_early(&c->btree_cache);
	bch2_fs_btree_gc_init_early(c);
	bch2_fs_btree_interior_update_init_early(c);
	bch2_fs_btree_iter_init_early(c);
	bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
	bch2_fs_btree_write_buffer_init_early(c);
	bch2_fs_copygc_init(c);
	bch2_fs_ec_init_early(c);
	bch2_fs_journal_init_early(&c->journal);
	bch2_fs_journal_keys_init(c);
	bch2_fs_move_init(c);
	bch2_fs_nocow_locking_init_early(c);
	bch2_fs_quota_init(c);
	bch2_fs_recovery_passes_init(c);
	bch2_fs_sb_errors_init_early(c);
	bch2_fs_snapshots_init_early(c);
	bch2_fs_subvolumes_init_early(c);

	INIT_LIST_HEAD(&c->list);

	mutex_init(&c->bio_bounce_pages_lock);
	mutex_init(&c->snapshot_table_lock);
	init_rwsem(&c->snapshot_create_lock);

	spin_lock_init(&c->btree_write_error_lock);

	INIT_LIST_HEAD(&c->journal_iters);

	INIT_LIST_HEAD(&c->fsck_error_msgs);
	mutex_init(&c->fsck_error_msgs_lock);

	seqcount_init(&c->usage_lock);

	sema_init(&c->io_in_flight, 128);

	INIT_LIST_HEAD(&c->vfs_inodes_list);
	mutex_init(&c->vfs_inodes_lock);

	c->journal.flush_write_time	= &c->times[BCH_TIME_journal_flush_write];
	c->journal.noflush_write_time	= &c->times[BCH_TIME_journal_noflush_write];
	c->journal.flush_seq_time	= &c->times[BCH_TIME_journal_flush_seq];

	mutex_init(&c->sectors_available_lock);

	ret = percpu_init_rwsem(&c->mark_lock);
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	ret = bch2_sb_to_fs(c, sb);
	mutex_unlock(&c->sb_lock);

	if (ret)
		goto err;

	/* Compat: */
	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
	    !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
		SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);

	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
	    !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
		SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);

	c->opts = bch2_opts_default;
	ret = bch2_opts_from_sb(&c->opts, sb);
	if (ret)
		goto err;

	bch2_opts_apply(&c->opts, *opts);

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    c->opts.block_size > PAGE_SIZE) {
		bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE");
		ret = -EINVAL;
		goto err;
	}

	c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
	if (c->opts.inodes_use_key_cache)
		c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
	c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;

	c->block_bits = ilog2(block_sectors(c));
	c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);

	if (bch2_fs_init_fault("fs_alloc")) {
		bch_err(c, "fs_alloc fault injected");
		ret = -EFAULT;
		goto err;
	}

	if (c->sb.multi_device)
		pr_uuid(&name, c->sb.user_uuid.b);
	else
		prt_bdevname(&name, sbs->data[0].bdev);

	ret = name.allocation_failure ?
		-BCH_ERR_ENOMEM_fs_name_alloc : 0;
	if (ret)
		goto err;

	strscpy(c->name, name.buf, sizeof(c->name));
	printbuf_exit(&name);

	iter_size = sizeof(struct sort_iter) +
		(btree_blocks(c) + 1) * 2 *
		sizeof(struct sort_iter_set);

	if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
	    enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
				bch2_writes_disabled) ||
	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
	    bioset_init(&c->btree_bio, 1,
			max(offsetof(struct btree_read_bio, bio),
			    offsetof(struct btree_write_bio, wbio.bio)),
			BIOSET_NEED_BVECS) ||
	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
	    !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
	    !(c->online_reserved = alloc_percpu(u64)) ||
	    mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
				       c->opts.btree_node_size) ||
	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
		ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
		goto err;
	}

	ret =
	    bch2_fs_async_obj_init(c) ?:
	    bch2_fs_btree_cache_init(c) ?:
	    bch2_fs_btree_iter_init(c) ?:
	    bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
	    bch2_fs_buckets_waiting_for_journal_init(c) ?:
	    bch2_io_clock_init(&c->io_clock[READ]) ?:
	    bch2_io_clock_init(&c->io_clock[WRITE]) ?:
	    bch2_fs_compress_init(c) ?:
	    bch2_fs_counters_init(c) ?:
	    bch2_fs_ec_init(c) ?:
	    bch2_fs_encryption_init(c) ?:
	    bch2_fs_fsio_init(c) ?:
	    bch2_fs_fs_io_direct_init(c) ?:
	    bch2_fs_io_read_init(c) ?:
	    bch2_fs_rebalance_init(c) ?:
	    bch2_fs_sb_errors_init(c) ?:
	    bch2_fs_vfs_init(c);
	if (ret)
		goto err;

	if (go_rw_in_recovery(c)) {
		/*
		 * start workqueues/kworkers early - kthread creation checks for
		 * pending signals, which is _very_ annoying
		 */
		ret = bch2_fs_init_rw(c);
		if (ret)
			goto err;
	}

#ifdef CONFIG_UNICODE
	if (bch2_fs_casefold_enabled(c)) {
		/* Default encoding until we can potentially have more as an option. */
		c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
		if (IS_ERR(c->cf_encoding)) {
			printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. "
Version: %u.%u.%u", 1033 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), 1034 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), 1035 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); 1036 ret = -EINVAL; 1037 goto err; 1038 } 1039 } 1040 #else 1041 if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { 1042 printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); 1043 ret = -EINVAL; 1044 goto err; 1045 } 1046 #endif 1047 1048 for (i = 0; i < c->sb.nr_devices; i++) { 1049 if (!bch2_member_exists(c->disk_sb.sb, i)) 1050 continue; 1051 ret = bch2_dev_alloc(c, i); 1052 if (ret) 1053 goto err; 1054 } 1055 1056 bch2_journal_entry_res_resize(&c->journal, 1057 &c->btree_root_journal_res, 1058 BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); 1059 bch2_journal_entry_res_resize(&c->journal, 1060 &c->clock_journal_res, 1061 (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); 1062 1063 mutex_lock(&bch_fs_list_lock); 1064 ret = bch2_fs_online(c); 1065 mutex_unlock(&bch_fs_list_lock); 1066 1067 if (ret) 1068 goto err; 1069 out: 1070 return c; 1071 err: 1072 bch2_fs_free(c); 1073 c = ERR_PTR(ret); 1074 goto out; 1075 } 1076 1077 noinline_for_stack 1078 static void print_mount_opts(struct bch_fs *c) 1079 { 1080 enum bch_opt_id i; 1081 CLASS(printbuf, p)(); 1082 bch2_log_msg_start(c, &p); 1083 1084 prt_str(&p, "starting version "); 1085 bch2_version_to_text(&p, c->sb.version); 1086 1087 bool first = true; 1088 for (i = 0; i < bch2_opts_nr; i++) { 1089 const struct bch_option *opt = &bch2_opt_table[i]; 1090 u64 v = bch2_opt_get_by_id(&c->opts, i); 1091 1092 if (!(opt->flags & OPT_MOUNT)) 1093 continue; 1094 1095 if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) 1096 continue; 1097 1098 prt_str(&p, first ? " opts=" : ","); 1099 first = false; 1100 bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); 1101 } 1102 1103 if (c->sb.version_incompat_allowed != c->sb.version) { 1104 prt_printf(&p, "\nallowing incompatible features above "); 1105 bch2_version_to_text(&p, c->sb.version_incompat_allowed); 1106 } 1107 1108 if (c->opts.verbose) { 1109 prt_printf(&p, "\nfeatures: "); 1110 prt_bitflags(&p, bch2_sb_features, c->sb.features); 1111 } 1112 1113 if (c->sb.multi_device) { 1114 prt_printf(&p, "\nwith devices"); 1115 for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { 1116 prt_char(&p, ' '); 1117 prt_str(&p, ca->name); 1118 } 1119 } 1120 1121 bch2_print_str(c, KERN_INFO, p.buf); 1122 } 1123 1124 static bool bch2_fs_may_start(struct bch_fs *c) 1125 { 1126 struct bch_dev *ca; 1127 unsigned flags = 0; 1128 1129 switch (c->opts.degraded) { 1130 case BCH_DEGRADED_very: 1131 flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; 1132 break; 1133 case BCH_DEGRADED_yes: 1134 flags |= BCH_FORCE_IF_DEGRADED; 1135 break; 1136 default: 1137 mutex_lock(&c->sb_lock); 1138 for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { 1139 if (!bch2_member_exists(c->disk_sb.sb, i)) 1140 continue; 1141 1142 ca = bch2_dev_locked(c, i); 1143 1144 if (!bch2_dev_is_online(ca) && 1145 (ca->mi.state == BCH_MEMBER_STATE_rw || 1146 ca->mi.state == BCH_MEMBER_STATE_ro)) { 1147 mutex_unlock(&c->sb_lock); 1148 return false; 1149 } 1150 } 1151 mutex_unlock(&c->sb_lock); 1152 break; 1153 } 1154 1155 return bch2_have_enough_devs(c, c->online_devs, flags, true); 1156 } 1157 1158 int bch2_fs_start(struct bch_fs *c) 1159 { 1160 time64_t now = ktime_get_real_seconds(); 1161 int ret = 0; 1162 1163 print_mount_opts(c); 1164 1165 if (c->cf_encoding) 1166 bch_info(c, "Using encoding 
			 "defined by superblock: utf8-%u.%u.%u",
			 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
			 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
			 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));

	if (!bch2_fs_may_start(c))
		return bch_err_throw(c, insufficient_devices_to_start);

	down_write(&c->state_lock);
	mutex_lock(&c->sb_lock);

	BUG_ON(test_bit(BCH_FS_started, &c->flags));

	if (!bch2_sb_field_get_minsize(&c->disk_sb, ext,
			sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
		mutex_unlock(&c->sb_lock);
		up_write(&c->state_lock);
		ret = bch_err_throw(c, ENOSPC_sb);
		goto err;
	}

	ret = bch2_sb_members_v2_init(c);
	if (ret) {
		mutex_unlock(&c->sb_lock);
		up_write(&c->state_lock);
		goto err;
	}

	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca)
			bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
				cpu_to_le64(now);

	/*
	 * Don't write superblock yet: recovery might have to downgrade
	 */
	mutex_unlock(&c->sb_lock);

	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca)
			if (ca->mi.state == BCH_MEMBER_STATE_rw)
				bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);
	up_write(&c->state_lock);

	c->recovery_task = current;
	ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
		? bch2_fs_recovery(c)
		: bch2_fs_initialize(c);
	c->recovery_task = NULL;

	if (ret)
		goto err;

	ret = bch2_opts_hooks_pre_set(c);
	if (ret)
		goto err;

	if (bch2_fs_init_fault("fs_start")) {
		ret = bch_err_throw(c, injected_fs_start);
		goto err;
	}

	set_bit(BCH_FS_started, &c->flags);
	wake_up(&c->ro_ref_wait);

	down_write(&c->state_lock);
	if (c->opts.read_only)
		bch2_fs_read_only(c);
	else if (!test_bit(BCH_FS_rw, &c->flags))
		ret = bch2_fs_read_write(c);
	up_write(&c->state_lock);

err:
	if (ret)
		bch_err_msg(c, ret, "starting filesystem");
	else
		bch_verbose(c, "done starting filesystem");
	return ret;
}

static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);

	if (le16_to_cpu(sb->block_size) != block_sectors(c))
		return bch_err_throw(c, mismatched_block_size);

	if (le16_to_cpu(m.bucket_size) <
	    BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
		return bch_err_throw(c, bucket_size_too_small);

	return 0;
}

static int bch2_dev_in_fs(struct bch_sb_handle *fs,
			  struct bch_sb_handle *sb,
			  struct bch_opts *opts)
{
	if (fs == sb)
		return 0;

	if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
		return -BCH_ERR_device_not_a_member_of_filesystem;

	if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
		return -BCH_ERR_device_has_been_removed;

	if (fs->sb->block_size != sb->sb->block_size)
		return -BCH_ERR_mismatched_block_size;

	if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
	    le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
		return 0;

	if (fs->sb->seq == sb->sb->seq &&
	    fs->sb->write_time != sb->sb->write_time) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Split brain detected between ");
		prt_bdevname(&buf, sb->bdev);
		prt_str(&buf, " and ");
		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ':');
		prt_newline(&buf);
prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); 1292 prt_newline(&buf); 1293 1294 prt_bdevname(&buf, fs->bdev); 1295 prt_char(&buf, ' '); 1296 bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); 1297 prt_newline(&buf); 1298 1299 prt_bdevname(&buf, sb->bdev); 1300 prt_char(&buf, ' '); 1301 bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); 1302 prt_newline(&buf); 1303 1304 if (!opts->no_splitbrain_check) 1305 prt_printf(&buf, "Not using older sb"); 1306 1307 pr_err("%s", buf.buf); 1308 printbuf_exit(&buf); 1309 1310 if (!opts->no_splitbrain_check) 1311 return -BCH_ERR_device_splitbrain; 1312 } 1313 1314 struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); 1315 u64 seq_from_fs = le64_to_cpu(m.seq); 1316 u64 seq_from_member = le64_to_cpu(sb->sb->seq); 1317 1318 if (seq_from_fs && seq_from_fs < seq_from_member) { 1319 struct printbuf buf = PRINTBUF; 1320 1321 prt_str(&buf, "Split brain detected between "); 1322 prt_bdevname(&buf, sb->bdev); 1323 prt_str(&buf, " and "); 1324 prt_bdevname(&buf, fs->bdev); 1325 prt_char(&buf, ':'); 1326 prt_newline(&buf); 1327 1328 prt_bdevname(&buf, fs->bdev); 1329 prt_str(&buf, " believes seq of "); 1330 prt_bdevname(&buf, sb->bdev); 1331 prt_printf(&buf, " to be %llu, but ", seq_from_fs); 1332 prt_bdevname(&buf, sb->bdev); 1333 prt_printf(&buf, " has %llu\n", seq_from_member); 1334 1335 if (!opts->no_splitbrain_check) { 1336 prt_str(&buf, "Not using "); 1337 prt_bdevname(&buf, sb->bdev); 1338 } 1339 1340 pr_err("%s", buf.buf); 1341 printbuf_exit(&buf); 1342 1343 if (!opts->no_splitbrain_check) 1344 return -BCH_ERR_device_splitbrain; 1345 } 1346 1347 return 0; 1348 } 1349 1350 /* Device startup/shutdown: */ 1351 1352 static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) 1353 { 1354 if (rw == READ) 1355 clear_bit(ca->dev_idx, ca->fs->online_devs.d); 1356 1357 if (!enumerated_ref_is_zero(&ca->io_ref[rw])) 1358 enumerated_ref_stop(&ca->io_ref[rw], 1359 rw == READ 1360 ? 
				    bch2_dev_read_refs
				    : bch2_dev_write_refs);
}

static void bch2_dev_release(struct kobject *kobj)
{
	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);

	kfree(ca);
}

static void bch2_dev_free(struct bch_dev *ca)
{
	WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
	WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));

	cancel_work_sync(&ca->io_error_work);

	bch2_dev_unlink(ca);

	if (ca->kobj.state_in_sysfs)
		kobject_del(&ca->kobj);

	bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
	bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_allocator_background_exit(ca);
	bch2_dev_journal_exit(ca);

	free_percpu(ca->io_done);
	bch2_dev_buckets_free(ca);
	kfree(ca->sb_read_scratch);

	bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
	bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);

	enumerated_ref_exit(&ca->io_ref[WRITE]);
	enumerated_ref_exit(&ca->io_ref[READ]);
#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_exit(&ca->ref);
#endif
	kobject_put(&ca->kobj);
}

static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	if (enumerated_ref_is_zero(&ca->io_ref[READ]))
		return;

	__bch2_dev_read_only(c, ca);

	bch2_dev_io_ref_stop(ca, READ);

	bch2_dev_unlink(ca);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_journal_exit(ca);
}

#ifndef CONFIG_BCACHEFS_DEBUG
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
	struct bch_dev *ca = container_of(ref, struct bch_dev, ref);

	complete(&ca->ref_completion);
}
#endif

static void bch2_dev_unlink(struct bch_dev *ca)
{
	struct kobject *b;

	/*
	 * This is racy w.r.t. the underlying block device being hot-removed,
	 * which removes it from sysfs.
	 *
	 * It'd be lovely if we had a way to handle this race, but the sysfs
	 * code doesn't appear to provide a good method and block/holder.c is
	 * susceptible as well:
	 */
	if (ca->kobj.state_in_sysfs &&
	    ca->disk_sb.bdev &&
	    (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
		sysfs_remove_link(b, "bcachefs");
		sysfs_remove_link(&ca->kobj, "block");
	}
}

static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
	int ret;

	if (!c->kobj.state_in_sysfs)
		return 0;

	if (!ca->kobj.state_in_sysfs) {
		ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
			bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
		if (ret)
			return ret;
	}

	if (ca->disk_sb.bdev) {
		struct kobject *block = bdev_kobj(ca->disk_sb.bdev);

		ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
		if (ret)
			return ret;

		ret = sysfs_create_link(&ca->kobj, block, "block");
		if (ret)
			return ret;
	}

	return 0;
}

static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
					struct bch_member *member)
{
	struct bch_dev *ca;
	unsigned i;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca)
		return NULL;

	kobject_init(&ca->kobj, &bch2_dev_ktype);
	init_completion(&ca->ref_completion);

	INIT_WORK(&ca->io_error_work, bch2_io_error_work);

	bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
	bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);

	ca->mi = bch2_mi_to_cpu(member);

	for (i = 0; i < ARRAY_SIZE(member->errors); i++)
		atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));

	ca->uuid = member->uuid;

	ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
				ca->mi.bucket_size / btree_sectors(c));

#ifndef CONFIG_BCACHEFS_DEBUG
	if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
		goto err;
#else
	atomic_long_set(&ca->ref, 1);
#endif

	mutex_init(&ca->bucket_backpointer_mismatch.lock);
	mutex_init(&ca->bucket_backpointer_empty.lock);

	bch2_dev_allocator_background_init(ca);

	if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) ||
	    enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
	    !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
	    bch2_dev_buckets_alloc(c, ca) ||
	    !(ca->io_done = alloc_percpu(*ca->io_done)))
		goto err;

	return ca;
err:
	bch2_dev_free(ca);
	return NULL;
}

static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
			    unsigned dev_idx)
{
	ca->dev_idx = dev_idx;
	__set_bit(ca->dev_idx, ca->self.d);

	if (!ca->name[0])
		scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);

	ca->fs = c;
	rcu_assign_pointer(c->devs[ca->dev_idx], ca);

	if (bch2_dev_sysfs_online(c, ca))
		pr_warn("error creating sysfs objects");
}

static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
	struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
	struct bch_dev *ca = NULL;

	if (bch2_fs_init_fault("dev_alloc"))
		goto err;

	ca = __bch2_dev_alloc(c, &member);
	if (!ca)
		goto err;

	ca->fs = c;

	bch2_dev_attach(c, ca, dev_idx);
	return 0;
err:
	return
		bch_err_throw(c, ENOMEM_dev_alloc);
}

static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
{
	unsigned ret;

	if (bch2_dev_is_online(ca)) {
		bch_err(ca, "already have device online in slot %u",
			sb->sb->dev_idx);
		return bch_err_throw(ca->fs, device_already_online);
	}

	if (get_capacity(sb->bdev->bd_disk) <
	    ca->mi.bucket_size * ca->mi.nbuckets) {
		bch_err(ca, "cannot online: device too small");
		return bch_err_throw(ca->fs, device_size_too_small);
	}

	BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
	BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));

	ret = bch2_dev_journal_init(ca, sb->sb);
	if (ret)
		return ret;

	struct printbuf name = PRINTBUF;
	prt_bdevname(&name, sb->bdev);
	strscpy(ca->name, name.buf, sizeof(ca->name));
	printbuf_exit(&name);

	/* Commit: */
	ca->disk_sb = *sb;
	memset(sb, 0, sizeof(*sb));

	/*
	 * Stash pointer to the filesystem for blk_holder_ops - note that once
	 * attached to a filesystem, we will always close the block device
	 * before tearing down the filesystem object.
	 */
	ca->disk_sb.holder->c = ca->fs;

	ca->dev = ca->disk_sb.bdev->bd_dev;

	enumerated_ref_start(&ca->io_ref[READ]);

	return 0;
}

static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
{
	struct bch_dev *ca;
	int ret;

	lockdep_assert_held(&c->state_lock);

	if (le64_to_cpu(sb->sb->seq) >
	    le64_to_cpu(c->disk_sb.sb->seq))
		bch2_sb_to_fs(c, sb->sb);

	BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));

	ca = bch2_dev_locked(c, sb->sb->dev_idx);

	ret = __bch2_dev_attach_bdev(ca, sb);
	if (ret)
		return ret;

	set_bit(ca->dev_idx, c->online_devs.d);

	bch2_dev_sysfs_online(c, ca);

	bch2_rebalance_wakeup(c);
	return 0;
}

/* Device management: */

/*
 * Note: this function is also used by the error paths - when a particular
 * device sees an error, we call it to determine whether we can just set the
 * device RO, or - if this function returns false - we'll set the whole
 * filesystem RO:
 *
 * XXX: maybe we should be more explicit about whether we're changing state
 * because we got an error or what have you?
 */
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
			    enum bch_member_state new_state, int flags)
{
	struct bch_devs_mask new_online_devs;
	int nr_rw = 0, required;

	lockdep_assert_held(&c->state_lock);

	switch (new_state) {
	case BCH_MEMBER_STATE_rw:
		return true;
	case BCH_MEMBER_STATE_ro:
		if (ca->mi.state != BCH_MEMBER_STATE_rw)
			return true;

		/* do we have enough devices to write to? */
		for_each_member_device(c, ca2)
			if (ca2 != ca)
				nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;

		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
			       ? c->opts.metadata_replicas
			       : metadata_replicas_required(c),
			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
			       ? c->opts.data_replicas
			       : data_replicas_required(c));

		return nr_rw >= required;
	case BCH_MEMBER_STATE_failed:
	case BCH_MEMBER_STATE_spare:
		if (ca->mi.state != BCH_MEMBER_STATE_rw &&
		    ca->mi.state != BCH_MEMBER_STATE_ro)
			return true;

		/* do we have enough devices to read from? */
		new_online_devs = c->online_devs;
		__clear_bit(ca->dev_idx, new_online_devs.d);

		return bch2_have_enough_devs(c, new_online_devs, flags, false);
	default:
		BUG();
	}
}

static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
	bch2_dev_io_ref_stop(ca, WRITE);

	/*
	 * The allocator thread itself allocates btree nodes, so stop it first:
	 */
	bch2_dev_allocator_remove(c, ca);
	bch2_recalc_capacity(c);
	bch2_dev_journal_stop(&c->journal, ca);
}

static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);

	bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);

	if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
		enumerated_ref_start(&ca->io_ref[WRITE]);

	bch2_dev_do_discards(ca);
}

int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
			 enum bch_member_state new_state, int flags)
{
	struct bch_member *m;
	int ret = 0;

	if (ca->mi.state == new_state)
		return 0;

	if (!bch2_dev_state_allowed(c, ca, new_state, flags))
		return bch_err_throw(c, device_state_not_allowed);

	if (new_state != BCH_MEMBER_STATE_rw)
		__bch2_dev_read_only(c, ca);

	bch_notice(ca, "%s", bch2_member_states[new_state]);

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	SET_BCH_MEMBER_STATE(m, new_state);
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (new_state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	bch2_rebalance_wakeup(c);

	return ret;
}

int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
		       enum bch_member_state new_state, int flags)
{
	int ret;

	down_write(&c->state_lock);
	ret = __bch2_dev_set_state(c, ca, new_state, flags);
	up_write(&c->state_lock);

	return ret;
}

/* Device add/removal: */

int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	struct bch_member *m;
	unsigned dev_idx = ca->dev_idx, data;
	bool fast_device_removal = !bch2_request_incompat_feature(c,
			bcachefs_metadata_version_fast_device_removal);
	int ret;

	down_write(&c->state_lock);

	/*
	 * We consume a reference to ca->ref, regardless of whether we succeed
	 * or fail:
	 */
	bch2_dev_put(ca);

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
		bch_err(ca, "Cannot remove without losing data");
		ret = bch_err_throw(c, device_state_not_allowed);
		goto err;
	}

	__bch2_dev_read_only(c, ca);

	ret = fast_device_removal
		?
		bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
		: (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
		   bch2_dev_remove_stripes(c, ca->dev_idx, flags));
	if (ret)
		goto err;

	/* Check if device still has data before blowing away alloc info */
	struct bch_dev_usage usage = bch2_dev_usage_read(ca);
	for (unsigned i = 0; i < BCH_DATA_NR; i++)
		if (!data_type_is_empty(i) &&
		    !data_type_is_hidden(i) &&
		    usage.buckets[i]) {
			bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
				__bch2_data_types[i], usage.buckets[i]);
			ret = -EBUSY;
			goto err;
		}

	ret = bch2_dev_remove_alloc(c, ca);
	bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
	if (ret)
		goto err;

	/*
	 * We need to flush the entire journal to get rid of keys that reference
	 * the device being removed before removing the superblock entry
	 */
	bch2_journal_flush_all_pins(&c->journal);

	/*
	 * this is really just needed for the bch2_replicas_gc_(start|end)
	 * calls, and could be cleaned up:
	 */
	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
	bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
	if (ret)
		goto err;

	ret = bch2_journal_flush(&c->journal);
	bch_err_msg(ca, ret, "bch2_journal_flush()");
	if (ret)
		goto err;

	ret = bch2_replicas_gc2(c);
	bch_err_msg(ca, ret, "bch2_replicas_gc2()");
	if (ret)
		goto err;

	data = bch2_dev_has_data(c, ca);
	if (data) {
		struct printbuf data_has = PRINTBUF;

		prt_bitflags(&data_has, __bch2_data_types, data);
		bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
		printbuf_exit(&data_has);
		ret = -EBUSY;
		goto err;
	}

	__bch2_dev_offline(c, ca);

	mutex_lock(&c->sb_lock);
	rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
	mutex_unlock(&c->sb_lock);

#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_kill(&ca->ref);
#else
	ca->dying = true;
	bch2_dev_put(ca);
#endif
	wait_for_completion(&ca->ref_completion);

	bch2_dev_free(ca);

	/*
	 * Free this device's slot in the bch_member array - all pointers to
	 * this device must be gone:
	 */
	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);

	if (fast_device_removal)
		m->uuid = BCH_SB_MEMBER_DELETED_UUID;
	else
		memset(&m->uuid, 0, sizeof(m->uuid));

	bch2_write_super(c);

	mutex_unlock(&c->sb_lock);
	up_write(&c->state_lock);
	return 0;
err:
	if (test_bit(BCH_FS_rw, &c->flags) &&
	    ca->mi.state == BCH_MEMBER_STATE_rw &&
	    !enumerated_ref_is_zero(&ca->io_ref[READ]))
		__bch2_dev_read_write(c, ca);
	up_write(&c->state_lock);
	return ret;
}

/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = {};
	struct bch_dev *ca = NULL;
	struct printbuf errbuf = PRINTBUF;
	struct printbuf label = PRINTBUF;
	int ret = 0;

	ret = bch2_read_super(path, &opts, &sb);
	bch_err_msg(c, ret, "reading super");
	if (ret)
		goto err;

	struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);

	if (BCH_MEMBER_GROUP(&dev_mi)) {
		bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
		if
		    (label.allocation_failure) {
			ret = -ENOMEM;
			goto err;
		}
	}

	if (list_empty(&c->list)) {
		mutex_lock(&bch_fs_list_lock);
		if (__bch2_uuid_to_fs(c->sb.uuid))
			ret = bch_err_throw(c, filesystem_uuid_already_open);
		else
			list_add(&c->list, &bch_fs_list);
		mutex_unlock(&bch_fs_list_lock);

		if (ret) {
			bch_err(c, "filesystem UUID already open");
			goto err;
		}
	}

	ret = bch2_dev_may_add(sb.sb, c);
	if (ret)
		goto err;

	ca = __bch2_dev_alloc(c, &dev_mi);
	if (!ca) {
		ret = -ENOMEM;
		goto err;
	}

	ret = __bch2_dev_attach_bdev(ca, &sb);
	if (ret)
		goto err;

	down_write(&c->state_lock);
	mutex_lock(&c->sb_lock);
	SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);

	ret = bch2_sb_from_fs(c, ca);
	bch_err_msg(c, ret, "setting up new superblock");
	if (ret)
		goto err_unlock;

	if (dynamic_fault("bcachefs:add:no_slot"))
		goto err_unlock;

	ret = bch2_sb_member_alloc(c);
	if (ret < 0) {
		bch_err_msg(c, ret, "setting up new superblock");
		goto err_unlock;
	}
	unsigned dev_idx = ret;
	ret = 0;

	/* success: */

	dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
	*bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;

	ca->disk_sb.sb->dev_idx = dev_idx;
	bch2_dev_attach(c, ca, dev_idx);

	if (BCH_MEMBER_GROUP(&dev_mi)) {
		ret = __bch2_dev_group_set(c, ca, label.buf);
		bch_err_msg(c, ret, "creating new label");
		if (ret)
			goto err_unlock;
	}

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (test_bit(BCH_FS_started, &c->flags)) {
		ret = bch2_dev_usage_init(ca, false);
		if (ret)
			goto err_late;

		ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
		bch_err_msg(ca, ret, "marking new superblock");
		if (ret)
			goto err_late;

		ret = bch2_fs_freespace_init(c);
		bch_err_msg(ca, ret, "initializing free space");
		if (ret)
			goto err_late;

		if (ca->mi.state == BCH_MEMBER_STATE_rw)
			__bch2_dev_read_write(c, ca);

		ret = bch2_dev_journal_alloc(ca, false);
		bch_err_msg(c, ret, "allocating journal");
		if (ret)
			goto err_late;
	}

	/*
	 * We just changed the superblock UUID, invalidate cache and send a
	 * uevent to update /dev/disk/by-uuid
	 */
	invalidate_bdev(ca->disk_sb.bdev);

	char uuid_str[37];
	snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid);

	char *envp[] = {
		"CHANGE=uuid",
		uuid_str,
		NULL,
	};
	kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);

	up_write(&c->state_lock);
out:
	printbuf_exit(&label);
	printbuf_exit(&errbuf);
	bch_err_fn(c, ret);
	return ret;

err_unlock:
	mutex_unlock(&c->sb_lock);
	up_write(&c->state_lock);
err:
	if (ca)
		bch2_dev_free(ca);
	bch2_free_super(&sb);
	goto out;
err_late:
	up_write(&c->state_lock);
	ca = NULL;
	goto err;
}

/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = { NULL };
	struct bch_dev *ca;
	unsigned dev_idx;
	int ret;

	down_write(&c->state_lock);

	ret = bch2_read_super(path, &opts, &sb);
	ret = bch2_read_super(path, &opts, &sb);
	if (ret) {
		up_write(&c->state_lock);
		return ret;
	}

	dev_idx = sb.sb->dev_idx;

	ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
	bch_err_msg(c, ret, "bringing %s online", path);
	if (ret)
		goto err;

	ret = bch2_dev_attach_bdev(c, &sb);
	if (ret)
		goto err;

	ca = bch2_dev_locked(c, dev_idx);

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
	if (ret)
		goto err;

	if (ca->mi.state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	if (!ca->mi.freespace_initialized) {
		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		bch_err_msg(ca, ret, "initializing free space");
		if (ret)
			goto err;
	}

	if (!ca->journal.nr) {
		ret = bch2_dev_journal_alloc(ca, false);
		bch_err_msg(ca, ret, "allocating journal");
		if (ret)
			goto err;
	}

	mutex_lock(&c->sb_lock);
	bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
		cpu_to_le64(ktime_get_real_seconds());
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	up_write(&c->state_lock);
	return 0;
err:
	up_write(&c->state_lock);
	bch2_free_super(&sb);
	return ret;
}

int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	down_write(&c->state_lock);

	if (!bch2_dev_is_online(ca)) {
		bch_err(ca, "Already offline");
		up_write(&c->state_lock);
		return 0;
	}

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
		bch_err(ca, "Cannot offline required disk");
		up_write(&c->state_lock);
		return bch_err_throw(c, device_state_not_allowed);
	}

	__bch2_dev_offline(c, ca);

	up_write(&c->state_lock);
	return 0;
}

static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
{
	struct bch_fs *c = ca->fs;
	u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };

	return bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
			bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
						  .dev = ca->dev_idx,
						  .data_type = BCH_DATA_free)) ?:
		bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
}

int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bch_member *m;
	u64 old_nbuckets;
	int ret = 0;

	down_write(&c->state_lock);
	old_nbuckets = ca->mi.nbuckets;

	if (nbuckets < ca->mi.nbuckets) {
		bch_err(ca, "Cannot shrink yet");
		ret = -EINVAL;
		goto err;
	}

	if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
		bch_err(ca, "New device size too big (%llu greater than max %u)",
			nbuckets, BCH_MEMBER_NBUCKETS_MAX);
		ret = bch_err_throw(c, device_size_too_big);
		goto err;
	}

	if (bch2_dev_is_online(ca) &&
	    get_capacity(ca->disk_sb.bdev->bd_disk) <
	    ca->mi.bucket_size * nbuckets) {
		bch_err(ca, "New size larger than device");
		ret = bch_err_throw(c, device_size_too_small);
		goto err;
	}

	ret = bch2_dev_buckets_resize(c, ca, nbuckets);
	bch_err_msg(ca, ret, "resizing buckets");
	if (ret)
		goto err;

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
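	/*
	 * The in-memory bucket arrays have been resized; persist the new
	 * size in this device's superblock member record.
	 */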
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	m->nbuckets = cpu_to_le64(nbuckets);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (ca->mi.freespace_initialized) {
		ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
		if (ret)
			goto err;
	}

	bch2_recalc_capacity(c);
err:
	up_write(&c->state_lock);
	return ret;
}

int bch2_fs_resize_on_mount(struct bch_fs *c)
{
	for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
		u64 old_nbuckets = ca->mi.nbuckets;
		u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
					     ca->mi.bucket_size);

		if (ca->mi.resize_on_mount &&
		    new_nbuckets > ca->mi.nbuckets) {
			bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
			int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
			bch_err_fn(ca, ret);
			if (ret) {
				enumerated_ref_put(&ca->io_ref[READ],
						   BCH_DEV_READ_REF_fs_resize_on_mount);
				up_write(&c->state_lock);
				return ret;
			}

			mutex_lock(&c->sb_lock);
			struct bch_member *m =
				bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
			m->nbuckets = cpu_to_le64(new_nbuckets);
			SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false);

			c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image));
			bch2_write_super(c);
			mutex_unlock(&c->sb_lock);

			if (ca->mi.freespace_initialized) {
				ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets);
				if (ret) {
					enumerated_ref_put(&ca->io_ref[READ],
							   BCH_DEV_READ_REF_fs_resize_on_mount);
					up_write(&c->state_lock);
					return ret;
				}
			}
		}
	}
	return 0;
}

/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
	if (!strncmp(name, "/dev/", strlen("/dev/")))
		name += strlen("/dev/");

	for_each_member_device(c, ca)
		if (!strcmp(name, ca->name))
			return ca;
	return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}

/* blk_holder_ops: */

static struct bch_fs *bdev_get_fs(struct block_device *bdev)
	__releases(&bdev->bd_holder_lock)
{
	struct bch_sb_handle_holder *holder = bdev->bd_holder;
	struct bch_fs *c = holder->c;

	if (c && !bch2_ro_ref_tryget(c))
		c = NULL;

	mutex_unlock(&bdev->bd_holder_lock);

	if (c)
		wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
	return c;
}

/* returns with ref on ca->ref */
static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
{
	for_each_member_device(c, ca)
		if (ca->disk_sb.bdev == bdev)
			return ca;
	return NULL;
}

static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	struct bch_fs *c = bdev_get_fs(bdev);
	if (!c)
		return;

	struct super_block *sb = c->vfs_sb;
	if (sb) {
		/*
		 * Not necessary, c->ro_ref guards against the filesystem being
		 * unmounted - we only take this to avoid a warning in
		 * sync_filesystem:
		 */
		down_read(&sb->s_umount);
	}

	down_write(&c->state_lock);
	struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
	if (!ca)
		goto unlock;

	bool dev = bch2_dev_state_allowed(c, ca,
					  BCH_MEMBER_STATE_failed,
					  BCH_FORCE_IF_DEGRADED);

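	/*
	 * 'dev' is true when this device can simply be dropped to 'failed'
	 * (the filesystem can keep going, degraded, without it); otherwise
	 * we're headed for emergency read-only below, so flush and shrink
	 * the VFS caches first.
	 */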
	if (!dev && sb) {
		if (!surprise)
			sync_filesystem(sb);
		shrink_dcache_sb(sb);
		evict_inodes(sb);
	}

	struct printbuf buf = PRINTBUF;
	__bch2_log_msg_start(ca->name, &buf);

	prt_printf(&buf, "offline from block layer");

	if (dev) {
		__bch2_dev_offline(c, ca);
	} else {
		bch2_journal_flush(&c->journal);
		bch2_fs_emergency_read_only2(c, &buf);
	}

	bch2_print_str(c, KERN_ERR, buf.buf);
	printbuf_exit(&buf);

	bch2_dev_put(ca);
unlock:
	if (sb)
		up_read(&sb->s_umount);
	up_write(&c->state_lock);
	bch2_ro_ref_put(c);
}

static void bch2_fs_bdev_sync(struct block_device *bdev)
{
	struct bch_fs *c = bdev_get_fs(bdev);
	if (!c)
		return;

	struct super_block *sb = c->vfs_sb;
	if (sb) {
		/*
		 * Not necessary, c->ro_ref guards against the filesystem being
		 * unmounted - we only take this to avoid a warning in
		 * sync_filesystem:
		 */
		down_read(&sb->s_umount);
		sync_filesystem(sb);
		up_read(&sb->s_umount);
	}

	bch2_ro_ref_put(c);
}

const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
	.mark_dead	= bch2_fs_bdev_mark_dead,
	.sync		= bch2_fs_bdev_sync,
};

/* Filesystem open: */

static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
{
	return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
		cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
}

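/*
 * Open a filesystem from a set of member device paths: read every superblock,
 * pick the newest one as authoritative, drop members the newest superblock
 * says were removed (or that lost a split brain), then allocate the
 * filesystem, attach each block device and - unless opts.nostart is set -
 * start it.
 */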
struct bch_fs *bch2_fs_open(darray_const_str *devices,
			    struct bch_opts *opts)
{
	bch_sb_handles sbs = {};
	struct bch_fs *c = NULL;
	struct bch_sb_handle *best = NULL;
	struct printbuf errbuf = PRINTBUF;
	int ret = 0;

	if (!try_module_get(THIS_MODULE))
		return ERR_PTR(-ENODEV);

	if (!devices->nr) {
		ret = -EINVAL;
		goto err;
	}

	ret = darray_make_room(&sbs, devices->nr);
	if (ret)
		goto err;

	darray_for_each(*devices, i) {
		struct bch_sb_handle sb = { NULL };

		ret = bch2_read_super(*i, opts, &sb);
		if (ret)
			goto err;

		BUG_ON(darray_push(&sbs, sb));
	}

	if (opts->nochanges && !opts->read_only) {
		ret = bch_err_throw(c, erofs_nochanges);
		goto err_print;
	}

	darray_for_each(sbs, sb)
		if (!best || sb_cmp(sb->sb, best->sb) > 0)
			best = sb;

	darray_for_each_reverse(sbs, sb) {
		ret = bch2_dev_in_fs(best, sb, opts);

		if (ret == -BCH_ERR_device_has_been_removed ||
		    ret == -BCH_ERR_device_splitbrain) {
			bch2_free_super(sb);
			darray_remove_item(&sbs, sb);
			best -= best > sb;
			ret = 0;
			continue;
		}

		if (ret)
			goto err_print;
	}

	c = bch2_fs_alloc(best->sb, opts, &sbs);
	ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		goto err;

	down_write(&c->state_lock);
	darray_for_each(sbs, sb) {
		ret = bch2_dev_attach_bdev(c, sb);
		if (ret) {
			up_write(&c->state_lock);
			goto err;
		}
	}
	up_write(&c->state_lock);

	if (!c->opts.nostart) {
		ret = bch2_fs_start(c);
		if (ret)
			goto err;
	}
out:
	darray_for_each(sbs, sb)
		bch2_free_super(sb);
	darray_exit(&sbs);
	printbuf_exit(&errbuf);
	module_put(THIS_MODULE);
	return c;
err_print:
	pr_err("bch_fs_open err opening %s: %s",
	       devices->data[0], bch2_err_str(ret));
err:
	if (!IS_ERR_OR_NULL(c))
		bch2_fs_stop(c);
	c = ERR_PTR(ret);
	goto out;
}

/* Global interfaces/init */

static void bcachefs_exit(void)
{
	bch2_debug_exit();
	bch2_vfs_exit();
	bch2_chardev_exit();
	bch2_btree_key_cache_exit();
	if (bcachefs_kset)
		kset_unregister(bcachefs_kset);
}

static int __init bcachefs_init(void)
{
	bch2_bkey_pack_test();

	if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
	    bch2_btree_key_cache_init() ||
	    bch2_chardev_init() ||
	    bch2_vfs_init() ||
	    bch2_debug_init())
		goto err;

	return 0;
err:
	bcachefs_exit();
	return -ENOMEM;
}

#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name);
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM

static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp)
{
	/* Match bool exactly, by re-using it. */
	struct static_key *key = kp->arg;
	struct kernel_param boolkp = *kp;
	bool v;
	int ret;

	boolkp.arg = &v;

	ret = param_set_bool(val, &boolkp);
	if (ret)
		return ret;
	if (v)
		static_key_enable(key);
	else
		static_key_disable(key);
	return 0;
}

static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp)
{
	struct static_key *key = kp->arg;
	return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'Y' : 'N');
}

static const struct kernel_param_ops bch2_param_ops_static_key_t = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = bch2_param_set_static_key_t,
	.get = bch2_param_get_static_key_t,
};

#define BCH_DEBUG_PARAM(name, description)				\
	module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
	__MODULE_PARM_TYPE(name, "static_key_t");			\
	MODULE_PARM_DESC(name, description);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM

__maybe_unused
static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
module_param_named(version, bch2_metadata_version, uint, 0444);

module_exit(bcachefs_exit);
module_init(bcachefs_init);