// SPDX-License-Identifier: GPL-2.0
/*
 * bcachefs setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fsck.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
#include "thread_with_file.h"
#include "trace.h"

#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>
#include <crypto/hash.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: crc64");
MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");

const char * const bch2_fs_flag_strs[] = {
#define x(n) #n,
	BCH_FS_FLAGS()
#undef x
	NULL
};

void bch2_print_str(struct bch_fs *c, const char *str)
{
#ifdef __KERNEL__
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

	if (unlikely(stdio)) {
		bch2_stdio_redirect_printf(stdio, true, "%s", str);
		return;
	}
#endif
	bch2_print_string_as_lines(KERN_ERR, str);
}

__printf(2, 0)
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
#ifdef __KERNEL__
	if (unlikely(stdio)) {
		if (fmt[0] == KERN_SOH[0])
			fmt += 2;

		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
		return;
	}
#endif
	vprintk(fmt, args);
}

void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
{
	struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;

	va_list args;
	va_start(args, fmt);
	bch2_print_maybe_redirect(stdio, fmt, args);
	va_end(args);
}

void __bch2_print(struct bch_fs *c, const char *fmt, ...)
{
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

	va_list args;
	va_start(args, fmt);
	bch2_print_maybe_redirect(stdio, fmt, args);
	va_end(args);
}

#define KTYPE(type)							\
static const struct attribute_group type ## _group = {			\
	.attrs = type ## _files						\
};									\
									\
static const struct attribute_group *type ## _groups[] = {		\
	&type ## _group,						\
	NULL								\
};									\
									\
static const struct kobj_type type ## _ktype = {			\
	.release	= type ## _release,				\
	.sysfs_ops	= &type ## _sysfs_ops,				\
	.default_groups = type ## _groups				\
}

static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);
static void bch2_fs_counters_release(struct kobject *k)
{
}

static void bch2_fs_internal_release(struct kobject *k)
{
}

static void bch2_fs_opts_dir_release(struct kobject *k)
{
}

static void bch2_fs_time_stats_release(struct kobject *k)
{
}

KTYPE(bch2_fs);
KTYPE(bch2_fs_counters);
KTYPE(bch2_fs_internal);
KTYPE(bch2_fs_opts_dir);
KTYPE(bch2_fs_time_stats);
KTYPE(bch2_dev);

static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);

DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);

static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);

struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
	struct bch_fs *c;

	mutex_lock(&bch_fs_list_lock);
	rcu_read_lock();

	list_for_each_entry(c, &bch_fs_list, list)
		for_each_member_device_rcu(c, ca, NULL)
			if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
				closure_get(&c->cl);
				goto found;
			}
	c = NULL;
found:
	rcu_read_unlock();
	mutex_unlock(&bch_fs_list_lock);

	return c;
}

static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *c;

	lockdep_assert_held(&bch_fs_list_lock);

	list_for_each_entry(c, &bch_fs_list, list)
		if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
			return c;

	return NULL;
}

struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *c;

	mutex_lock(&bch_fs_list_lock);
	c = __bch2_uuid_to_fs(uuid);
	if (c)
		closure_get(&c->cl);
	mutex_unlock(&bch_fs_list_lock);

	return c;
}

/* Filesystem RO/RW: */

/*
 * For startup/shutdown of RW stuff, the dependencies are:
 *
 * - foreground writes depend on copygc and rebalance (to free up space)
 *
 * - copygc and rebalance depend on mark and sweep gc (they actually probably
 *   don't because they either reserve ahead of time or don't block if
 *   allocations fail, but allocations can require mark and sweep gc to run
 *   because of generation number wraparound)
 *
 * - all of the above depends on the allocator threads
 *
 * - allocator depends on the journal (when it rewrites prios and gens)
 */

static void __bch2_fs_read_only(struct bch_fs *c)
{
	unsigned clean_passes = 0;
	u64 seq = 0;

	bch2_fs_ec_stop(c);
	bch2_open_buckets_stop(c, NULL, true);
	bch2_rebalance_stop(c);
	bch2_copygc_stop(c);
	bch2_fs_ec_flush(c);

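	/*
	 * Loop until we've had two consecutive passes where flushing generated
	 * no new work and the journal sequence number stopped advancing:
	 */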
bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", 269 journal_cur_seq(&c->journal)); 270 271 do { 272 clean_passes++; 273 274 if (bch2_btree_interior_updates_flush(c) || 275 bch2_btree_write_buffer_flush_going_ro(c) || 276 bch2_journal_flush_all_pins(&c->journal) || 277 bch2_btree_flush_all_writes(c) || 278 seq != atomic64_read(&c->journal.seq)) { 279 seq = atomic64_read(&c->journal.seq); 280 clean_passes = 0; 281 } 282 } while (clean_passes < 2); 283 284 bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", 285 journal_cur_seq(&c->journal)); 286 287 if (test_bit(JOURNAL_replay_done, &c->journal.flags) && 288 !test_bit(BCH_FS_emergency_ro, &c->flags)) 289 set_bit(BCH_FS_clean_shutdown, &c->flags); 290 291 bch2_fs_journal_stop(&c->journal); 292 293 bch_info(c, "%sclean shutdown complete, journal seq %llu", 294 test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", 295 c->journal.seq_ondisk); 296 297 /* 298 * After stopping journal: 299 */ 300 for_each_member_device(c, ca) 301 bch2_dev_allocator_remove(c, ca); 302 } 303 304 #ifndef BCH_WRITE_REF_DEBUG 305 static void bch2_writes_disabled(struct percpu_ref *writes) 306 { 307 struct bch_fs *c = container_of(writes, struct bch_fs, writes); 308 309 set_bit(BCH_FS_write_disable_complete, &c->flags); 310 wake_up(&bch2_read_only_wait); 311 } 312 #endif 313 314 void bch2_fs_read_only(struct bch_fs *c) 315 { 316 if (!test_bit(BCH_FS_rw, &c->flags)) { 317 bch2_journal_reclaim_stop(&c->journal); 318 return; 319 } 320 321 BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); 322 323 bch_verbose(c, "going read-only"); 324 325 /* 326 * Block new foreground-end write operations from starting - any new 327 * writes will return -EROFS: 328 */ 329 set_bit(BCH_FS_going_ro, &c->flags); 330 #ifndef BCH_WRITE_REF_DEBUG 331 percpu_ref_kill(&c->writes); 332 #else 333 for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) 334 bch2_write_ref_put(c, i); 335 #endif 336 337 /* 338 * If we're not doing an emergency shutdown, we want to wait on 339 * outstanding writes to complete so they don't see spurious errors due 340 * to shutting down the allocator: 341 * 342 * If we are doing an emergency shutdown outstanding writes may 343 * hang until we shutdown the allocator so we don't want to wait 344 * on outstanding writes before shutting everything down - but 345 * we do need to wait on them before returning and signalling 346 * that going RO is complete: 347 */ 348 wait_event(bch2_read_only_wait, 349 test_bit(BCH_FS_write_disable_complete, &c->flags) || 350 test_bit(BCH_FS_emergency_ro, &c->flags)); 351 352 bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); 353 if (writes_disabled) 354 bch_verbose(c, "finished waiting for writes to stop"); 355 356 __bch2_fs_read_only(c); 357 358 wait_event(bch2_read_only_wait, 359 test_bit(BCH_FS_write_disable_complete, &c->flags)); 360 361 if (!writes_disabled) 362 bch_verbose(c, "finished waiting for writes to stop"); 363 364 clear_bit(BCH_FS_write_disable_complete, &c->flags); 365 clear_bit(BCH_FS_going_ro, &c->flags); 366 clear_bit(BCH_FS_rw, &c->flags); 367 368 if (!bch2_journal_error(&c->journal) && 369 !test_bit(BCH_FS_error, &c->flags) && 370 !test_bit(BCH_FS_emergency_ro, &c->flags) && 371 test_bit(BCH_FS_started, &c->flags) && 372 test_bit(BCH_FS_clean_shutdown, &c->flags) && 373 c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { 374 BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); 375 
		BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
		BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
		BUG_ON(c->btree_write_buffer.inc.keys.nr);
		BUG_ON(c->btree_write_buffer.flushing.keys.nr);
		bch2_verify_accounting_clean(c);

		bch_verbose(c, "marking filesystem clean");
		bch2_fs_mark_clean(c);
	} else {
		bch_verbose(c, "done going read-only, filesystem not clean");
	}
}

static void bch2_fs_read_only_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, read_only_work);

	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);
}

static void bch2_fs_read_only_async(struct bch_fs *c)
{
	queue_work(system_long_wq, &c->read_only_work);
}

bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	bch2_journal_halt(&c->journal);
	bch2_fs_read_only_async(c);

	wake_up(&bch2_read_only_wait);
	return ret;
}

bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	bch2_journal_halt_locked(&c->journal);
	bch2_fs_read_only_async(c);

	wake_up(&bch2_read_only_wait);
	return ret;
}

static int bch2_fs_read_write_late(struct bch_fs *c)
{
	int ret;

	/*
	 * Data move operations can't run until after check_snapshots has
	 * completed, and bch2_snapshot_is_ancestor() is available.
	 *
	 * Ideally we'd start copygc/rebalance earlier instead of waiting for
	 * all of recovery/fsck to complete:
	 */
	ret = bch2_copygc_start(c);
	if (ret) {
		bch_err(c, "error starting copygc thread");
		return ret;
	}

	ret = bch2_rebalance_start(c);
	if (ret) {
		bch_err(c, "error starting rebalance thread");
		return ret;
	}

	return 0;
}

static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
	int ret;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
		bch_err(c, "cannot go rw, unfixed btree errors");
		return -BCH_ERR_erofs_unfixed_errors;
	}

	if (test_bit(BCH_FS_rw, &c->flags))
		return 0;

	bch_info(c, "going read-write");

	ret = bch2_sb_members_v2_init(c);
	if (ret)
		goto err;

	ret = bch2_fs_mark_dirty(c);
	if (ret)
		goto err;

	clear_bit(BCH_FS_clean_shutdown, &c->flags);

	/*
	 * First journal write must be a flush write: after a clean shutdown we
	 * don't read the journal, so the first journal write may end up
	 * overwriting whatever was there previously, and there must always be
	 * at least one non-flush write in the journal or recovery will fail:
	 */
	set_bit(JOURNAL_need_flush_write, &c->journal.flags);
	set_bit(JOURNAL_running, &c->journal.flags);

	for_each_rw_member(c, ca)
		bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);

	set_bit(BCH_FS_rw, &c->flags);
	set_bit(BCH_FS_was_rw, &c->flags);

#ifndef BCH_WRITE_REF_DEBUG
	percpu_ref_reinit(&c->writes);
#else
	for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
		BUG_ON(atomic_long_read(&c->writes[i]));
		atomic_long_inc(&c->writes[i]);
	}
#endif

	ret = bch2_journal_reclaim_start(&c->journal);
	if (ret)
		goto err;

	if (!early) {
		ret = bch2_fs_read_write_late(c);
		if (ret)
			goto err;
	}

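	/*
	 * Now that we're read-write, kick off background work that may have
	 * accumulated while we were read-only:
	 */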
	bch2_do_discards(c);
	bch2_do_invalidates(c);
	bch2_do_stripe_deletes(c);
	bch2_do_pending_node_rewrites(c);
	return 0;
err:
	if (test_bit(BCH_FS_rw, &c->flags))
		bch2_fs_read_only(c);
	else
		__bch2_fs_read_only(c);
	return ret;
}

int bch2_fs_read_write(struct bch_fs *c)
{
	if (c->opts.recovery_pass_last &&
	    c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
		return -BCH_ERR_erofs_norecovery;

	if (c->opts.nochanges)
		return -BCH_ERR_erofs_nochanges;

	return __bch2_fs_read_write(c, false);
}

int bch2_fs_read_write_early(struct bch_fs *c)
{
	lockdep_assert_held(&c->state_lock);

	return __bch2_fs_read_write(c, true);
}

/* Filesystem startup/shutdown: */

static void __bch2_fs_free(struct bch_fs *c)
{
	for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
		bch2_time_stats_exit(&c->times[i]);

	bch2_find_btree_nodes_exit(&c->found_btree_nodes);
	bch2_free_pending_node_rewrites(c);
	bch2_fs_accounting_exit(c);
	bch2_fs_sb_errors_exit(c);
	bch2_fs_counters_exit(c);
	bch2_fs_snapshots_exit(c);
	bch2_fs_quota_exit(c);
	bch2_fs_fs_io_direct_exit(c);
	bch2_fs_fs_io_buffered_exit(c);
	bch2_fs_fsio_exit(c);
	bch2_fs_vfs_exit(c);
	bch2_fs_ec_exit(c);
	bch2_fs_encryption_exit(c);
	bch2_fs_nocow_locking_exit(c);
	bch2_fs_io_write_exit(c);
	bch2_fs_io_read_exit(c);
	bch2_fs_buckets_waiting_for_journal_exit(c);
	bch2_fs_btree_interior_update_exit(c);
	bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
	bch2_fs_btree_cache_exit(c);
	bch2_fs_btree_iter_exit(c);
	bch2_fs_replicas_exit(c);
	bch2_fs_journal_exit(&c->journal);
	bch2_io_clock_exit(&c->io_clock[WRITE]);
	bch2_io_clock_exit(&c->io_clock[READ]);
	bch2_fs_compress_exit(c);
	bch2_fs_btree_gc_exit(c);
	bch2_journal_keys_put_initial(c);
	bch2_find_btree_nodes_exit(&c->found_btree_nodes);
	BUG_ON(atomic_read(&c->journal_keys.ref));
	bch2_fs_btree_write_buffer_exit(c);
	percpu_free_rwsem(&c->mark_lock);
	if (c->online_reserved) {
		u64 v = percpu_u64_get(c->online_reserved);
		WARN(v, "online_reserved not 0 at shutdown: %lli", v);
		free_percpu(c->online_reserved);
	}

	darray_exit(&c->btree_roots_extra);
	free_percpu(c->pcpu);
	free_percpu(c->usage);
	mempool_exit(&c->large_bkey_pool);
	mempool_exit(&c->btree_bounce_pool);
	bioset_exit(&c->btree_bio);
	mempool_exit(&c->fill_iter);
#ifndef BCH_WRITE_REF_DEBUG
	percpu_ref_exit(&c->writes);
#endif
	kfree(rcu_dereference_protected(c->disk_groups, 1));
	kfree(c->journal_seq_blacklist_table);

	if (c->write_ref_wq)
		destroy_workqueue(c->write_ref_wq);
	if (c->btree_write_submit_wq)
		destroy_workqueue(c->btree_write_submit_wq);
	if (c->btree_read_complete_wq)
		destroy_workqueue(c->btree_read_complete_wq);
	if (c->copygc_wq)
		destroy_workqueue(c->copygc_wq);
	if (c->btree_io_complete_wq)
		destroy_workqueue(c->btree_io_complete_wq);
	if (c->btree_update_wq)
		destroy_workqueue(c->btree_update_wq);

	bch2_free_super(&c->disk_sb);
	kvfree(c);
	module_put(THIS_MODULE);
}

static void bch2_fs_release(struct kobject *kobj)
{
	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

	__bch2_fs_free(c);
}

void __bch2_fs_stop(struct bch_fs *c)
{
	bch_verbose(c, "shutting down");

	set_bit(BCH_FS_stopping, &c->flags);

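	/*
	 * Quiesce all writes by going read-only before tearing down sysfs
	 * objects and background workers:
	 */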
	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);

	for_each_member_device(c, ca)
		bch2_dev_unlink(ca);

	if (c->kobj.state_in_sysfs)
		kobject_del(&c->kobj);

	bch2_fs_debug_exit(c);
	bch2_fs_chardev_exit(c);

	bch2_ro_ref_put(c);
	wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));

	kobject_put(&c->counters_kobj);
	kobject_put(&c->time_stats);
	kobject_put(&c->opts_dir);
	kobject_put(&c->internal);

	/* btree prefetch might have kicked off reads in the background: */
	bch2_btree_flush_all_reads(c);

	for_each_member_device(c, ca)
		cancel_work_sync(&ca->io_error_work);

	cancel_work_sync(&c->read_only_work);
}

void bch2_fs_free(struct bch_fs *c)
{
	unsigned i;

	mutex_lock(&bch_fs_list_lock);
	list_del(&c->list);
	mutex_unlock(&bch_fs_list_lock);

	closure_sync(&c->cl);
	closure_debug_destroy(&c->cl);

	for (i = 0; i < c->sb.nr_devices; i++) {
		struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);

		if (ca) {
			EBUG_ON(atomic_long_read(&ca->ref) != 1);
			bch2_free_super(&ca->disk_sb);
			bch2_dev_free(ca);
		}
	}

	bch_verbose(c, "shutdown complete");

	kobject_put(&c->kobj);
}

void bch2_fs_stop(struct bch_fs *c)
{
	__bch2_fs_stop(c);
	bch2_fs_free(c);
}

static int bch2_fs_online(struct bch_fs *c)
{
	int ret = 0;

	lockdep_assert_held(&bch_fs_list_lock);

	if (__bch2_uuid_to_fs(c->sb.uuid)) {
		bch_err(c, "filesystem UUID already open");
		return -EINVAL;
	}

	ret = bch2_fs_chardev_init(c);
	if (ret) {
		bch_err(c, "error creating character device");
		return ret;
	}

	bch2_fs_debug_init(c);

	ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
	    kobject_add(&c->internal, &c->kobj, "internal") ?:
	    kobject_add(&c->opts_dir, &c->kobj, "options") ?:
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
	    kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
	    kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
	    bch2_opts_create_sysfs_files(&c->opts_dir);
	if (ret) {
		bch_err(c, "error creating sysfs objects");
		return ret;
	}

	down_write(&c->state_lock);

	for_each_member_device(c, ca) {
		ret = bch2_dev_sysfs_online(c, ca);
		if (ret) {
			bch_err(c, "error creating sysfs objects");
			bch2_dev_put(ca);
			goto err;
		}
	}

	BUG_ON(!list_empty(&c->list));
	list_add(&c->list, &bch_fs_list);
err:
	up_write(&c->state_lock);
	return ret;
}

static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
	struct bch_fs *c;
	struct printbuf name = PRINTBUF;
	unsigned i, iter_size;
	int ret = 0;

	c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
	if (!c) {
		c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
		goto out;
	}

	c->stdio = (void *)(unsigned long) opts.stdio;

	__module_get(THIS_MODULE);

	closure_init(&c->cl, NULL);

	c->kobj.kset = bcachefs_kset;
	kobject_init(&c->kobj, &bch2_fs_ktype);
	kobject_init(&c->internal, &bch2_fs_internal_ktype);
	kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
	kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
	kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);

	c->minor		= -1;
	c->disk_sb.fs_sb	= true;

	init_rwsem(&c->state_lock);
	mutex_init(&c->sb_lock);
	mutex_init(&c->replicas_gc_lock);
	mutex_init(&c->btree_root_lock);
	INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);

	refcount_set(&c->ro_ref, 1);
	init_waitqueue_head(&c->ro_ref_wait);
	spin_lock_init(&c->recovery_pass_lock);
	sema_init(&c->online_fsck_mutex, 1);

	for (i = 0; i < BCH_TIME_STAT_NR; i++)
		bch2_time_stats_init(&c->times[i]);

	bch2_fs_copygc_init(c);
	bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
	bch2_fs_btree_iter_init_early(c);
	bch2_fs_btree_interior_update_init_early(c);
	bch2_fs_journal_keys_init(c);
	bch2_fs_allocator_background_init(c);
	bch2_fs_allocator_foreground_init(c);
	bch2_fs_rebalance_init(c);
	bch2_fs_quota_init(c);
	bch2_fs_ec_init_early(c);
	bch2_fs_move_init(c);
	bch2_fs_sb_errors_init_early(c);

	INIT_LIST_HEAD(&c->list);

	mutex_init(&c->bio_bounce_pages_lock);
	mutex_init(&c->snapshot_table_lock);
	init_rwsem(&c->snapshot_create_lock);

	spin_lock_init(&c->btree_write_error_lock);

	INIT_LIST_HEAD(&c->journal_iters);

	INIT_LIST_HEAD(&c->fsck_error_msgs);
	mutex_init(&c->fsck_error_msgs_lock);

	seqcount_init(&c->usage_lock);

	sema_init(&c->io_in_flight, 128);

	INIT_LIST_HEAD(&c->vfs_inodes_list);
	mutex_init(&c->vfs_inodes_lock);

	c->journal.flush_write_time	= &c->times[BCH_TIME_journal_flush_write];
	c->journal.noflush_write_time	= &c->times[BCH_TIME_journal_noflush_write];
	c->journal.flush_seq_time	= &c->times[BCH_TIME_journal_flush_seq];

	bch2_fs_btree_cache_init_early(&c->btree_cache);

	mutex_init(&c->sectors_available_lock);

	ret = percpu_init_rwsem(&c->mark_lock);
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	ret = bch2_sb_to_fs(c, sb);
	mutex_unlock(&c->sb_lock);

	if (ret)
		goto err;

	pr_uuid(&name, c->sb.user_uuid.b);
	ret = name.allocation_failure
		? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
	if (ret)
		goto err;

	strscpy(c->name, name.buf, sizeof(c->name));
	printbuf_exit(&name);

	/* Compat: */
	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
	    !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
		SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);

	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
	    !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
		SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);

	c->opts = bch2_opts_default;
	ret = bch2_opts_from_sb(&c->opts, sb);
	if (ret)
		goto err;

	bch2_opts_apply(&c->opts, opts);

	c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
	if (c->opts.inodes_use_key_cache)
		c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
	c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;

	c->block_bits		= ilog2(block_sectors(c));
	c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);

	if (bch2_fs_init_fault("fs_alloc")) {
		bch_err(c, "fs_alloc fault injected");
		ret = -EFAULT;
		goto err;
	}

	iter_size = sizeof(struct sort_iter) +
		(btree_blocks(c) + 1) * 2 *
		sizeof(struct sort_iter_set);

	if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
	    !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
	    !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
	    !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
	    !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
	    !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
				WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
	    percpu_ref_init(&c->writes, bch2_writes_disabled,
			    PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
#endif
	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
	    bioset_init(&c->btree_bio, 1,
			max(offsetof(struct btree_read_bio, bio),
			    offsetof(struct btree_write_bio, wbio.bio)),
			BIOSET_NEED_BVECS) ||
	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
	    !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
	    !(c->online_reserved = alloc_percpu(u64)) ||
	    mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
				       c->opts.btree_node_size) ||
	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
		ret = -BCH_ERR_ENOMEM_fs_other_alloc;
		goto err;
	}

	ret = bch2_fs_counters_init(c) ?:
	    bch2_fs_sb_errors_init(c) ?:
	    bch2_io_clock_init(&c->io_clock[READ]) ?:
	    bch2_io_clock_init(&c->io_clock[WRITE]) ?:
	    bch2_fs_journal_init(&c->journal) ?:
	    bch2_fs_btree_iter_init(c) ?:
	    bch2_fs_btree_cache_init(c) ?:
	    bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
	    bch2_fs_btree_interior_update_init(c) ?:
	    bch2_fs_btree_gc_init(c) ?:
	    bch2_fs_buckets_waiting_for_journal_init(c) ?:
	    bch2_fs_btree_write_buffer_init(c) ?:
	    bch2_fs_subvolumes_init(c) ?:
	    bch2_fs_io_read_init(c) ?:
	    bch2_fs_io_write_init(c) ?:
	    bch2_fs_nocow_locking_init(c) ?:
	    bch2_fs_encryption_init(c) ?:
	    bch2_fs_compress_init(c) ?:
	    bch2_fs_ec_init(c) ?:
	    bch2_fs_vfs_init(c) ?:
	    bch2_fs_fsio_init(c) ?:
	    bch2_fs_fs_io_buffered_init(c) ?:
	    bch2_fs_fs_io_direct_init(c);
	if (ret)
		goto err;

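	/* allocate an in-memory bch_dev for each member present in the superblock: */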
	for (i = 0; i < c->sb.nr_devices; i++) {
		if (!bch2_member_exists(c->disk_sb.sb, i))
			continue;
		ret = bch2_dev_alloc(c, i);
		if (ret)
			goto err;
	}

	bch2_journal_entry_res_resize(&c->journal,
			&c->btree_root_journal_res,
			BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
	bch2_journal_entry_res_resize(&c->journal,
			&c->clock_journal_res,
			(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);

	mutex_lock(&bch_fs_list_lock);
	ret = bch2_fs_online(c);
	mutex_unlock(&bch_fs_list_lock);

	if (ret)
		goto err;
out:
	return c;
err:
	bch2_fs_free(c);
	c = ERR_PTR(ret);
	goto out;
}

noinline_for_stack
static void print_mount_opts(struct bch_fs *c)
{
	enum bch_opt_id i;
	struct printbuf p = PRINTBUF;
	bool first = true;

	prt_str(&p, "starting version ");
	bch2_version_to_text(&p, c->sb.version);

	if (c->opts.read_only) {
		prt_str(&p, " opts=");
		first = false;
		prt_printf(&p, "ro");
	}

	for (i = 0; i < bch2_opts_nr; i++) {
		const struct bch_option *opt = &bch2_opt_table[i];
		u64 v = bch2_opt_get_by_id(&c->opts, i);

		if (!(opt->flags & OPT_MOUNT))
			continue;

		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
			continue;

		prt_str(&p, first ? " opts=" : ",");
		first = false;
		bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
	}

	bch_info(c, "%s", p.buf);
	printbuf_exit(&p);
}

int bch2_fs_start(struct bch_fs *c)
{
	time64_t now = ktime_get_real_seconds();
	int ret;

	print_mount_opts(c);

	down_write(&c->state_lock);

	BUG_ON(test_bit(BCH_FS_started, &c->flags));

	mutex_lock(&c->sb_lock);

	ret = bch2_sb_members_v2_init(c);
	if (ret) {
		mutex_unlock(&c->sb_lock);
		goto err;
	}

	for_each_online_member(c, ca)
		bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);

	struct bch_sb_field_ext *ext =
		bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
	mutex_unlock(&c->sb_lock);

	if (!ext) {
		bch_err(c, "insufficient space in superblock for sb_field_ext");
		ret = -BCH_ERR_ENOSPC_sb;
		goto err;
	}

	for_each_rw_member(c, ca)
		bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);

	c->recovery_task = current;
	ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
		? bch2_fs_recovery(c)
		: bch2_fs_initialize(c);
	c->recovery_task = NULL;

	if (ret)
		goto err;

	ret = bch2_opts_check_may_set(c);
	if (ret)
		goto err;

	if (bch2_fs_init_fault("fs_start")) {
		bch_err(c, "fs_start fault injected");
		ret = -EINVAL;
		goto err;
	}

	set_bit(BCH_FS_started, &c->flags);

	if (c->opts.read_only) {
		bch2_fs_read_only(c);
	} else {
		ret = !test_bit(BCH_FS_rw, &c->flags)
			? bch2_fs_read_write(c)
			: bch2_fs_read_write_late(c);
		if (ret)
			goto err;
	}

	ret = 0;
err:
	if (ret)
		bch_err_msg(c, ret, "starting filesystem");
	else
		bch_verbose(c, "done starting filesystem");
	up_write(&c->state_lock);
	return ret;
}

static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);

	if (le16_to_cpu(sb->block_size) != block_sectors(c))
		return -BCH_ERR_mismatched_block_size;

	if (le16_to_cpu(m.bucket_size) <
	    BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
		return -BCH_ERR_bucket_size_too_small;

	return 0;
}

static int bch2_dev_in_fs(struct bch_sb_handle *fs,
			  struct bch_sb_handle *sb,
			  struct bch_opts *opts)
{
	if (fs == sb)
		return 0;

	if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
		return -BCH_ERR_device_not_a_member_of_filesystem;

	if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
		return -BCH_ERR_device_has_been_removed;

	if (fs->sb->block_size != sb->sb->block_size)
		return -BCH_ERR_mismatched_block_size;

	if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
	    le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
		return 0;

	if (fs->sb->seq == sb->sb->seq &&
	    fs->sb->write_time != sb->sb->write_time) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Split brain detected between ");
		prt_bdevname(&buf, sb->bdev);
		prt_str(&buf, " and ");
		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ':');
		prt_newline(&buf);
		prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
		prt_newline(&buf);

		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ' ');
		bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
		prt_newline(&buf);

		prt_bdevname(&buf, sb->bdev);
		prt_char(&buf, ' ');
		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
		prt_newline(&buf);

		if (!opts->no_splitbrain_check)
			prt_printf(&buf, "Not using older sb");

		pr_err("%s", buf.buf);
		printbuf_exit(&buf);

		if (!opts->no_splitbrain_check)
			return -BCH_ERR_device_splitbrain;
	}

	struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
	u64 seq_from_fs		= le64_to_cpu(m.seq);
	u64 seq_from_member	= le64_to_cpu(sb->sb->seq);

	if (seq_from_fs && seq_from_fs < seq_from_member) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Split brain detected between ");
		prt_bdevname(&buf, sb->bdev);
		prt_str(&buf, " and ");
		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ':');
		prt_newline(&buf);

		prt_bdevname(&buf, fs->bdev);
		prt_str(&buf, " believes seq of ");
		prt_bdevname(&buf, sb->bdev);
		prt_printf(&buf, " to be %llu, but ", seq_from_fs);
		prt_bdevname(&buf, sb->bdev);
		prt_printf(&buf, " has %llu\n", seq_from_member);

		if (!opts->no_splitbrain_check) {
			prt_str(&buf, "Not using ");
			prt_bdevname(&buf, sb->bdev);
		}

		pr_err("%s", buf.buf);
		printbuf_exit(&buf);

		if (!opts->no_splitbrain_check)
			return -BCH_ERR_device_splitbrain;
	}

	return 0;
}

/* Device startup/shutdown: */

static void bch2_dev_release(struct kobject *kobj)
{
	struct bch_dev *ca = container_of(kobj,
					  struct bch_dev, kobj);

	kfree(ca);
}

static void bch2_dev_free(struct bch_dev *ca)
{
	cancel_work_sync(&ca->io_error_work);

	bch2_dev_unlink(ca);

	if (ca->kobj.state_in_sysfs)
		kobject_del(&ca->kobj);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_allocator_background_exit(ca);
	bch2_dev_journal_exit(ca);

	free_percpu(ca->io_done);
	bch2_dev_buckets_free(ca);
	kfree(ca->sb_read_scratch);

	bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
	bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);

	percpu_ref_exit(&ca->io_ref);
#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_exit(&ca->ref);
#endif
	kobject_put(&ca->kobj);
}

static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
{

	lockdep_assert_held(&c->state_lock);

	if (percpu_ref_is_zero(&ca->io_ref))
		return;

	__bch2_dev_read_only(c, ca);

	reinit_completion(&ca->io_ref_completion);
	percpu_ref_kill(&ca->io_ref);
	wait_for_completion(&ca->io_ref_completion);

	bch2_dev_unlink(ca);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_journal_exit(ca);
}

#ifndef CONFIG_BCACHEFS_DEBUG
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
	struct bch_dev *ca = container_of(ref, struct bch_dev, ref);

	complete(&ca->ref_completion);
}
#endif

static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
	struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);

	complete(&ca->io_ref_completion);
}

static void bch2_dev_unlink(struct bch_dev *ca)
{
	struct kobject *b;

	/*
	 * This is racy w.r.t. the underlying block device being hot-removed,
	 * which removes it from sysfs.
	 *
	 * It'd be lovely if we had a way to handle this race, but the sysfs
	 * code doesn't appear to provide a good method and block/holder.c is
	 * susceptible as well:
	 */
	if (ca->kobj.state_in_sysfs &&
	    ca->disk_sb.bdev &&
	    (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
		sysfs_remove_link(b, "bcachefs");
		sysfs_remove_link(&ca->kobj, "block");
	}
}

static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
	int ret;

	if (!c->kobj.state_in_sysfs)
		return 0;

	if (!ca->kobj.state_in_sysfs) {
		ret = kobject_add(&ca->kobj, &c->kobj,
				  "dev-%u", ca->dev_idx);
		if (ret)
			return ret;
	}

	if (ca->disk_sb.bdev) {
		struct kobject *block = bdev_kobj(ca->disk_sb.bdev);

		ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
		if (ret)
			return ret;

		ret = sysfs_create_link(&ca->kobj, block, "block");
		if (ret)
			return ret;
	}

	return 0;
}

static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
					struct bch_member *member)
{
	struct bch_dev *ca;
	unsigned i;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca)
		return NULL;

	kobject_init(&ca->kobj, &bch2_dev_ktype);
	init_completion(&ca->ref_completion);
	init_completion(&ca->io_ref_completion);

	INIT_WORK(&ca->io_error_work, bch2_io_error_work);

	bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
	bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);

	ca->mi = bch2_mi_to_cpu(member);

	for (i = 0; i < ARRAY_SIZE(member->errors); i++)
		atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));

	ca->uuid = member->uuid;

	ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
			     ca->mi.bucket_size / btree_sectors(c));

#ifndef CONFIG_BCACHEFS_DEBUG
	if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
		goto err;
#else
	atomic_long_set(&ca->ref, 1);
#endif

	bch2_dev_allocator_background_init(ca);

	if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
			    PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
	    !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
	    bch2_dev_buckets_alloc(c, ca) ||
	    !(ca->io_done = alloc_percpu(*ca->io_done)))
		goto err;

	return ca;
err:
	bch2_dev_free(ca);
	return NULL;
}

static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
			    unsigned dev_idx)
{
	ca->dev_idx = dev_idx;
	__set_bit(ca->dev_idx, ca->self.d);
	scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);

	ca->fs = c;
	rcu_assign_pointer(c->devs[ca->dev_idx], ca);

	if (bch2_dev_sysfs_online(c, ca))
		pr_warn("error creating sysfs objects");
}

static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
	struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
	struct bch_dev *ca = NULL;

	if (bch2_fs_init_fault("dev_alloc"))
		goto err;

	ca = __bch2_dev_alloc(c, &member);
	if (!ca)
		goto err;

	ca->fs = c;

	bch2_dev_attach(c, ca, dev_idx);
	return 0;
err:
	return -BCH_ERR_ENOMEM_dev_alloc;
}

static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
{
	unsigned ret;

	if (bch2_dev_is_online(ca)) {
bch_err(ca, "already have device online in slot %u", 1395 sb->sb->dev_idx); 1396 return -BCH_ERR_device_already_online; 1397 } 1398 1399 if (get_capacity(sb->bdev->bd_disk) < 1400 ca->mi.bucket_size * ca->mi.nbuckets) { 1401 bch_err(ca, "cannot online: device too small"); 1402 return -BCH_ERR_device_size_too_small; 1403 } 1404 1405 BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); 1406 1407 ret = bch2_dev_journal_init(ca, sb->sb); 1408 if (ret) 1409 return ret; 1410 1411 /* Commit: */ 1412 ca->disk_sb = *sb; 1413 memset(sb, 0, sizeof(*sb)); 1414 1415 ca->dev = ca->disk_sb.bdev->bd_dev; 1416 1417 percpu_ref_reinit(&ca->io_ref); 1418 1419 return 0; 1420 } 1421 1422 static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) 1423 { 1424 struct bch_dev *ca; 1425 int ret; 1426 1427 lockdep_assert_held(&c->state_lock); 1428 1429 if (le64_to_cpu(sb->sb->seq) > 1430 le64_to_cpu(c->disk_sb.sb->seq)) 1431 bch2_sb_to_fs(c, sb->sb); 1432 1433 BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); 1434 1435 ca = bch2_dev_locked(c, sb->sb->dev_idx); 1436 1437 ret = __bch2_dev_attach_bdev(ca, sb); 1438 if (ret) 1439 return ret; 1440 1441 bch2_dev_sysfs_online(c, ca); 1442 1443 struct printbuf name = PRINTBUF; 1444 prt_bdevname(&name, ca->disk_sb.bdev); 1445 1446 if (c->sb.nr_devices == 1) 1447 strscpy(c->name, name.buf, sizeof(c->name)); 1448 strscpy(ca->name, name.buf, sizeof(ca->name)); 1449 1450 printbuf_exit(&name); 1451 1452 rebalance_wakeup(c); 1453 return 0; 1454 } 1455 1456 /* Device management: */ 1457 1458 /* 1459 * Note: this function is also used by the error paths - when a particular 1460 * device sees an error, we call it to determine whether we can just set the 1461 * device RO, or - if this function returns false - we'll set the whole 1462 * filesystem RO: 1463 * 1464 * XXX: maybe we should be more explicit about whether we're changing state 1465 * because we got an error or what have you? 1466 */ 1467 bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, 1468 enum bch_member_state new_state, int flags) 1469 { 1470 struct bch_devs_mask new_online_devs; 1471 int nr_rw = 0, required; 1472 1473 lockdep_assert_held(&c->state_lock); 1474 1475 switch (new_state) { 1476 case BCH_MEMBER_STATE_rw: 1477 return true; 1478 case BCH_MEMBER_STATE_ro: 1479 if (ca->mi.state != BCH_MEMBER_STATE_rw) 1480 return true; 1481 1482 /* do we have enough devices to write to? */ 1483 for_each_member_device(c, ca2) 1484 if (ca2 != ca) 1485 nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; 1486 1487 required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) 1488 ? c->opts.metadata_replicas 1489 : metadata_replicas_required(c), 1490 !(flags & BCH_FORCE_IF_DATA_DEGRADED) 1491 ? c->opts.data_replicas 1492 : data_replicas_required(c)); 1493 1494 return nr_rw >= required; 1495 case BCH_MEMBER_STATE_failed: 1496 case BCH_MEMBER_STATE_spare: 1497 if (ca->mi.state != BCH_MEMBER_STATE_rw && 1498 ca->mi.state != BCH_MEMBER_STATE_ro) 1499 return true; 1500 1501 /* do we have enough devices to read from? 
		new_online_devs = bch2_online_devs(c);
		__clear_bit(ca->dev_idx, new_online_devs.d);

		return bch2_have_enough_devs(c, new_online_devs, flags, false);
	default:
		BUG();
	}
}

static bool bch2_fs_may_start(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i, flags = 0;

	if (c->opts.very_degraded)
		flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;

	if (c->opts.degraded)
		flags |= BCH_FORCE_IF_DEGRADED;

	if (!c->opts.degraded &&
	    !c->opts.very_degraded) {
		mutex_lock(&c->sb_lock);

		for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
			if (!bch2_member_exists(c->disk_sb.sb, i))
				continue;

			ca = bch2_dev_locked(c, i);

			if (!bch2_dev_is_online(ca) &&
			    (ca->mi.state == BCH_MEMBER_STATE_rw ||
			     ca->mi.state == BCH_MEMBER_STATE_ro)) {
				mutex_unlock(&c->sb_lock);
				return false;
			}
		}
		mutex_unlock(&c->sb_lock);
	}

	return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
}

static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
	/*
	 * The allocator thread itself allocates btree nodes, so stop it first:
	 */
	bch2_dev_allocator_remove(c, ca);
	bch2_recalc_capacity(c);
	bch2_dev_journal_stop(&c->journal, ca);
}

static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);

	bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);
	bch2_dev_do_discards(ca);
}

int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
			 enum bch_member_state new_state, int flags)
{
	struct bch_member *m;
	int ret = 0;

	if (ca->mi.state == new_state)
		return 0;

	if (!bch2_dev_state_allowed(c, ca, new_state, flags))
		return -BCH_ERR_device_state_not_allowed;

	if (new_state != BCH_MEMBER_STATE_rw)
		__bch2_dev_read_only(c, ca);

	bch_notice(ca, "%s", bch2_member_states[new_state]);

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	SET_BCH_MEMBER_STATE(m, new_state);
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (new_state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	rebalance_wakeup(c);

	return ret;
}

int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
		       enum bch_member_state new_state, int flags)
{
	int ret;

	down_write(&c->state_lock);
	ret = __bch2_dev_set_state(c, ca, new_state, flags);
	up_write(&c->state_lock);

	return ret;
}

/* Device add/removal: */

int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	struct bch_member *m;
	unsigned dev_idx = ca->dev_idx, data;
	int ret;

	down_write(&c->state_lock);

	/*
	 * We consume a reference to ca->ref, regardless of whether we succeed
	 * or fail:
	 */
	bch2_dev_put(ca);

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
		bch_err(ca, "Cannot remove without losing data");
		ret = -BCH_ERR_device_state_not_allowed;
		goto err;
	}

	__bch2_dev_read_only(c, ca);

	ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
	bch_err_msg(ca, ret, "bch2_dev_data_drop()");
	if (ret)
		goto err;

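	/* drop this device's entries from the alloc info btrees: */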
	ret = bch2_dev_remove_alloc(c, ca);
	bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
	if (ret)
		goto err;

	/*
	 * We need to flush the entire journal to get rid of keys that reference
	 * the device being removed before removing the superblock entry
	 */
	bch2_journal_flush_all_pins(&c->journal);

	/*
	 * this is really just needed for the bch2_replicas_gc_(start|end)
	 * calls, and could be cleaned up:
	 */
	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
	bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
	if (ret)
		goto err;

	ret = bch2_journal_flush(&c->journal);
	bch_err_msg(ca, ret, "bch2_journal_flush()");
	if (ret)
		goto err;

	ret = bch2_replicas_gc2(c);
	bch_err_msg(ca, ret, "bch2_replicas_gc2()");
	if (ret)
		goto err;

	data = bch2_dev_has_data(c, ca);
	if (data) {
		struct printbuf data_has = PRINTBUF;

		prt_bitflags(&data_has, __bch2_data_types, data);
		bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
		printbuf_exit(&data_has);
		ret = -EBUSY;
		goto err;
	}

	__bch2_dev_offline(c, ca);

	mutex_lock(&c->sb_lock);
	rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
	mutex_unlock(&c->sb_lock);

#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_kill(&ca->ref);
#else
	ca->dying = true;
	bch2_dev_put(ca);
#endif
	wait_for_completion(&ca->ref_completion);

	bch2_dev_free(ca);

	/*
	 * Free this device's slot in the bch_member array - all pointers to
	 * this device must be gone:
	 */
	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
	memset(&m->uuid, 0, sizeof(m->uuid));

	bch2_write_super(c);

	mutex_unlock(&c->sb_lock);
	up_write(&c->state_lock);
	return 0;
err:
	if (ca->mi.state == BCH_MEMBER_STATE_rw &&
	    !percpu_ref_is_zero(&ca->io_ref))
		__bch2_dev_read_write(c, ca);
	up_write(&c->state_lock);
	return ret;
}

/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb;
	struct bch_dev *ca = NULL;
	struct printbuf errbuf = PRINTBUF;
	struct printbuf label = PRINTBUF;
	int ret;

	ret = bch2_read_super(path, &opts, &sb);
	bch_err_msg(c, ret, "reading super");
	if (ret)
		goto err;

	struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);

	if (BCH_MEMBER_GROUP(&dev_mi)) {
		bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
		if (label.allocation_failure) {
			ret = -ENOMEM;
			goto err;
		}
	}

	ret = bch2_dev_may_add(sb.sb, c);
	if (ret)
		goto err;

	ca = __bch2_dev_alloc(c, &dev_mi);
	if (!ca) {
		ret = -ENOMEM;
		goto err;
	}

	ret = __bch2_dev_attach_bdev(ca, &sb);
	if (ret)
		goto err;

	down_write(&c->state_lock);
	mutex_lock(&c->sb_lock);

	ret = bch2_sb_from_fs(c, ca);
	bch_err_msg(c, ret, "setting up new superblock");
	if (ret)
		goto err_unlock;

	if (dynamic_fault("bcachefs:add:no_slot"))
		goto err_unlock;

	ret = bch2_sb_member_alloc(c);
	if (ret < 0) {
		bch_err_msg(c, ret, "setting up new superblock");
		goto err_unlock;
	}
	unsigned dev_idx = ret;

	/* success: */

	dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
	*bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;

	ca->disk_sb.sb->dev_idx	= dev_idx;
	bch2_dev_attach(c, ca, dev_idx);

	if (BCH_MEMBER_GROUP(&dev_mi)) {
		ret = __bch2_dev_group_set(c, ca, label.buf);
		bch_err_msg(c, ret, "creating new label");
		if (ret)
			goto err_unlock;
	}

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	ret = bch2_dev_usage_init(ca, false);
	if (ret)
		goto err_late;

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	bch_err_msg(ca, ret, "marking new superblock");
	if (ret)
		goto err_late;

	ret = bch2_fs_freespace_init(c);
	bch_err_msg(ca, ret, "initializing free space");
	if (ret)
		goto err_late;

	if (ca->mi.state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	ret = bch2_dev_journal_alloc(ca, false);
	bch_err_msg(c, ret, "allocating journal");
	if (ret)
		goto err_late;

	up_write(&c->state_lock);
out:
	printbuf_exit(&label);
	printbuf_exit(&errbuf);
	bch_err_fn(c, ret);
	return ret;

err_unlock:
	mutex_unlock(&c->sb_lock);
	up_write(&c->state_lock);
err:
	if (ca)
		bch2_dev_free(ca);
	bch2_free_super(&sb);
	goto out;
err_late:
	up_write(&c->state_lock);
	ca = NULL;
	goto err;
}

/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = { NULL };
	struct bch_dev *ca;
	unsigned dev_idx;
	int ret;

	down_write(&c->state_lock);

	ret = bch2_read_super(path, &opts, &sb);
	if (ret) {
		up_write(&c->state_lock);
		return ret;
	}

	dev_idx = sb.sb->dev_idx;

	ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
	bch_err_msg(c, ret, "bringing %s online", path);
	if (ret)
		goto err;

	ret = bch2_dev_attach_bdev(c, &sb);
	if (ret)
		goto err;

	ca = bch2_dev_locked(c, dev_idx);

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
	if (ret)
		goto err;

	if (ca->mi.state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	if (!ca->mi.freespace_initialized) {
		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		bch_err_msg(ca, ret, "initializing free space");
		if (ret)
			goto err;
	}

	if (!ca->journal.nr) {
		ret = bch2_dev_journal_alloc(ca, false);
		bch_err_msg(ca, ret, "allocating journal");
		if (ret)
			goto err;
	}

	mutex_lock(&c->sb_lock);
	bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
		cpu_to_le64(ktime_get_real_seconds());
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	up_write(&c->state_lock);
	return 0;
err:
	up_write(&c->state_lock);
	bch2_free_super(&sb);
	return ret;
}

int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	down_write(&c->state_lock);

	if (!bch2_dev_is_online(ca)) {
		bch_err(ca, "Already offline");
		up_write(&c->state_lock);
		return 0;
	}

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot offline required disk"); 1912 up_write(&c->state_lock); 1913 return -BCH_ERR_device_state_not_allowed; 1914 } 1915 1916 __bch2_dev_offline(c, ca); 1917 1918 up_write(&c->state_lock); 1919 return 0; 1920 } 1921 1922 int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) 1923 { 1924 struct bch_member *m; 1925 u64 old_nbuckets; 1926 int ret = 0; 1927 1928 down_write(&c->state_lock); 1929 old_nbuckets = ca->mi.nbuckets; 1930 1931 if (nbuckets < ca->mi.nbuckets) { 1932 bch_err(ca, "Cannot shrink yet"); 1933 ret = -EINVAL; 1934 goto err; 1935 } 1936 1937 if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { 1938 bch_err(ca, "New device size too big (%llu greater than max %u)", 1939 nbuckets, BCH_MEMBER_NBUCKETS_MAX); 1940 ret = -BCH_ERR_device_size_too_big; 1941 goto err; 1942 } 1943 1944 if (bch2_dev_is_online(ca) && 1945 get_capacity(ca->disk_sb.bdev->bd_disk) < 1946 ca->mi.bucket_size * nbuckets) { 1947 bch_err(ca, "New size larger than device"); 1948 ret = -BCH_ERR_device_size_too_small; 1949 goto err; 1950 } 1951 1952 ret = bch2_dev_buckets_resize(c, ca, nbuckets); 1953 bch_err_msg(ca, ret, "resizing buckets"); 1954 if (ret) 1955 goto err; 1956 1957 ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); 1958 if (ret) 1959 goto err; 1960 1961 mutex_lock(&c->sb_lock); 1962 m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 1963 m->nbuckets = cpu_to_le64(nbuckets); 1964 1965 bch2_write_super(c); 1966 mutex_unlock(&c->sb_lock); 1967 1968 if (ca->mi.freespace_initialized) { 1969 struct disk_accounting_pos acc = { 1970 .type = BCH_DISK_ACCOUNTING_dev_data_type, 1971 .dev_data_type.dev = ca->dev_idx, 1972 .dev_data_type.data_type = BCH_DATA_free, 1973 }; 1974 u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; 1975 1976 ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, 1977 bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: 1978 bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); 1979 if (ret) 1980 goto err; 1981 } 1982 1983 bch2_recalc_capacity(c); 1984 err: 1985 up_write(&c->state_lock); 1986 return ret; 1987 } 1988 1989 /* return with ref on ca->ref: */ 1990 struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) 1991 { 1992 if (!strncmp(name, "/dev/", strlen("/dev/"))) 1993 name += strlen("/dev/"); 1994 1995 for_each_member_device(c, ca) 1996 if (!strcmp(name, ca->name)) 1997 return ca; 1998 return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); 1999 } 2000 2001 /* Filesystem open: */ 2002 2003 static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) 2004 { 2005 return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: 2006 cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); 2007 } 2008 2009 struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, 2010 struct bch_opts opts) 2011 { 2012 DARRAY(struct bch_sb_handle) sbs = { 0 }; 2013 struct bch_fs *c = NULL; 2014 struct bch_sb_handle *best = NULL; 2015 struct printbuf errbuf = PRINTBUF; 2016 int ret = 0; 2017 2018 if (!try_module_get(THIS_MODULE)) 2019 return ERR_PTR(-ENODEV); 2020 2021 if (!nr_devices) { 2022 ret = -EINVAL; 2023 goto err; 2024 } 2025 2026 ret = darray_make_room(&sbs, nr_devices); 2027 if (ret) 2028 goto err; 2029 2030 for (unsigned i = 0; i < nr_devices; i++) { 2031 struct bch_sb_handle sb = { NULL }; 2032 2033 ret = bch2_read_super(devices[i], &opts, &sb); 2034 if (ret) 2035 goto err; 2036 2037 BUG_ON(darray_push(&sbs, sb)); 2038 } 2039 2040 if (opts.nochanges && !opts.read_only) { 2041 ret = -BCH_ERR_erofs_nochanges; 2042 goto 
	darray_for_each(sbs, sb)
		if (!best || sb_cmp(sb->sb, best->sb) > 0)
			best = sb;

	darray_for_each_reverse(sbs, sb) {
		ret = bch2_dev_in_fs(best, sb, &opts);

		if (ret == -BCH_ERR_device_has_been_removed ||
		    ret == -BCH_ERR_device_splitbrain) {
			bch2_free_super(sb);
			darray_remove_item(&sbs, sb);
			best -= best > sb;
			ret = 0;
			continue;
		}

		if (ret)
			goto err_print;
	}

	c = bch2_fs_alloc(best->sb, opts);
	ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		goto err;

	down_write(&c->state_lock);
	darray_for_each(sbs, sb) {
		ret = bch2_dev_attach_bdev(c, sb);
		if (ret) {
			up_write(&c->state_lock);
			goto err;
		}
	}
	up_write(&c->state_lock);

	if (!bch2_fs_may_start(c)) {
		ret = -BCH_ERR_insufficient_devices_to_start;
		goto err_print;
	}

	if (!c->opts.nostart) {
		ret = bch2_fs_start(c);
		if (ret)
			goto err;
	}
out:
	darray_for_each(sbs, sb)
		bch2_free_super(sb);
	darray_exit(&sbs);
	printbuf_exit(&errbuf);
	module_put(THIS_MODULE);
	return c;
err_print:
	pr_err("bch_fs_open err opening %s: %s",
	       devices[0], bch2_err_str(ret));
err:
	if (!IS_ERR_OR_NULL(c))
		bch2_fs_stop(c);
	c = ERR_PTR(ret);
	goto out;
}

/* Global interfaces/init */

static void bcachefs_exit(void)
{
	bch2_debug_exit();
	bch2_vfs_exit();
	bch2_chardev_exit();
	bch2_btree_key_cache_exit();
	if (bcachefs_kset)
		kset_unregister(bcachefs_kset);
}

static int __init bcachefs_init(void)
{
	bch2_bkey_pack_test();

	if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
	    bch2_btree_key_cache_init() ||
	    bch2_chardev_init() ||
	    bch2_vfs_init() ||
	    bch2_debug_init())
		goto err;

	return 0;
err:
	bcachefs_exit();
	return -ENOMEM;
}

#define BCH_DEBUG_PARAM(name, description)			\
	bool bch2_##name;					\
	module_param_named(name, bch2_##name, bool, 0644);	\
	MODULE_PARM_DESC(name, description);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM

__maybe_unused
static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
module_param_named(version, bch2_metadata_version, uint, 0400);

module_exit(bcachefs_exit);
module_init(bcachefs_init);